check, if any necessary packages are missing

list.of.packages <- c("dplyr", "compositions", "finalfit", "stats", "lmtest")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
tbdar = read.delim("~/Sinergia/Analysis/Human/Ancestry/Data/interim/metadata_Sinergia_final_all_pats_morePopulations.txt",
                  sep = "\t")



tbdar$Introduction <- ifelse(tbdar$Introduction %in% c("Intro 1", "Intro 5", "Intro 9", "Intro 10", "other"), tbdar$Introduction, NA)

tbdar$Intro10 <- ifelse(tbdar$Introduction == "Intro 10", 1, ifelse(tbdar$Introduction %in% c("Intro 1", "Intro 5", "Intro 9", "other"), 0, NA))
# create category for TB score
tbdar$TB_score_category <- ifelse(tbdar$TB_score < 6, "mild", ifelse(tbdar$TB_score > 5 & tbdar$TB_score < 8, "moderate", ifelse(tbdar$TB_score > 7, "severe", "somethingelse")))
tbdar$TB_score_2cat <- ifelse(tbdar$TB_score_category == "mild", "mild", ifelse(tbdar$TB_score_category %in% c("moderate", "severe"), "severe", NA))

tbdar$TB_category <- ifelse(tbdar$TB_category == "relapse", "relapse", ifelse(tbdar$TB_category == "new", "new", NA))
tbdar$Intro10 <- as.factor(tbdar$Intro10)
tbdar$DR_status <- ifelse(tbdar$DR_status %in% c("INH-Mono", "MDR", "Susceptible"), tbdar$DR_status, NA)
# age
hist(tbdar$age)

tbdar$age_cat <- ifelse(tbdar$age < 30, 1, ifelse(tbdar$age >= 30 & tbdar$age < 40, 2, ifelse(tbdar$age >= 40 & tbdar$age < 50, 3, ifelse(tbdar$age >= 50, 4, NA))))

# add whether they are patients or control

tbdar$Type <- ifelse(tbdar$PATIENT_ID >= 83000 & !tbdar$Lineage %in% c("L1", "L2", "L4", "L3"), "control", "patient")

# make a column containing the common tribes
tbdar$common_tribes <- ifelse(unname(table(tbdar$patient_tribe)[tbdar$patient_tribe]) > 100, tbdar$patient_tribe, "other") # at least 100 patients with that tribe

tbdar$socio_cat <- ifelse(tbdar$socioeconomic_status < 100, 1, ifelse(tbdar$socioeconomic_status >= 100 & tbdar$socioeconomic_status < 200, 2, ifelse(tbdar$socioeconomic_status >= 200, 3, NA)))
# create a category for X-ray score:
tbdar$Xray_score_category_ralph <- ifelse(tbdar$Xray_score < 71, "mild", ifelse(tbdar$Xray_score >= 71, "severe", NA))
tbdar$Xray_01 <- ifelse(tbdar$Xray_score_category_ralph == "mild", 0, ifelse(tbdar$Xray_score_category_ralph == "severe", 1, NA))
unique(tbdar$Xray_score_category_ralph)
## [1] NA       "mild"   "severe"
tbdar$Xray_score_category_ralph <- as.factor(tbdar$Xray_score_category_ralph)
# categorize the cough duration

tbdar$cough_cat <- ifelse(tbdar$symptoms_duration_cough_duration >= 4, 3, ifelse(tbdar$symptoms_duration_cough_duration < 4 & tbdar$symptoms_duration_cough_duration >= 3, 2, ifelse(tbdar$symptoms_duration_cough_duration < 3 & tbdar$symptoms_duration_cough_duration >= 0, 1, NA)))
# modify the empty HIV status to NA
tbdar$HIV_status <- ifelse(tbdar$HIV_status %in% c("infected", "positive"), "infected", ifelse(tbdar$HIV_status == "negative", "negative", NA))
unique(tbdar$HIV_status)
## [1] "negative" "infected" NA
tbdar$TB_RF_smoking <- ifelse(tbdar$TB_RF_smoking == "no", "no", ifelse(tbdar$TB_RF_smoking == "yes", "yes", NA))
# add whether they are patients or control
# categories for age
tbdar$age_cat <- ifelse(tbdar$age < 30, 1, ifelse(tbdar$age >= 30 & tbdar$age < 40, 2, ifelse(tbdar$age >= 40 & tbdar$age < 50, 3, ifelse(tbdar$age >= 50, 4, NA))))

tbdar$DR_category <- ifelse(tbdar$DR_status %in% c("INH-Mono", "MDR"), "resistant", ifelse(tbdar$DR_status =="Susceptible", "susceptible", NA))
# turn all of them into a factor:
tbdar$TB_score_category <- as.factor(tbdar$TB_score_category)
tbdar$Xray_score_category <- as.factor(tbdar$Xray_score_category)
tbdar$patient_sex <- as.factor(tbdar$patient_sex)
tbdar$HIV_status <- as.factor(tbdar$HIV_status)
tbdar$TB_RF_smoking <- as.factor(tbdar$TB_RF_smoking)
tbdar$Xray_01 <- as.factor(tbdar$Xray_01)
tbdar$general_examination_malnutrition <- as.factor(tbdar$general_examination_malnutrition)
tbdar$Education <- as.factor(tbdar$Education)
tbdar$Type <- as.factor(tbdar$Type)
tbdar$DR_status <- as.factor(tbdar$DR_status)
tbdar$DR_category <- as.factor(tbdar$DR_category)
tbdar$age_cat <- as.factor(tbdar$age_cat)
tbdar$cough_cat <- as.factor(tbdar$cough_cat)
tbdar$TB_score_2cat <- as.factor(tbdar$TB_score_2cat)
tbdar$TB_category <- as.factor(tbdar$TB_category)
# relevel some factors:
tbdar$patient_sex <- relevel(tbdar$patient_sex, "male")
tbdar$HIV_status <- relevel(tbdar$HIV_status, "negative")
tbdar$Xray_01 <- relevel(tbdar$Xray_01, 1)
tbdar$TB_score_2cat <- relevel(tbdar$TB_score_2cat, "severe")
tbdar$DR_status <- relevel(tbdar$DR_status, "Susceptible")
tbdar$DR_category <- relevel(tbdar$DR_category, "susceptible")
tbdar$Education <- relevel(tbdar$Education, "primary") # most patients have primary education
tbdar$general_examination_malnutrition <- relevel(tbdar$general_examination_malnutrition, "no")
tbdar$other <- 1-(tbdar$Western_Bantu + tbdar$Southeastern_Bantu + tbdar$Eastern_Bantu)
tbdar$socio_cat <- as.factor(tbdar$socio_cat)

create the tables shown in the paper

Table 1

# Table 1 Manuscript
library(finalfit)
library(readr)
# only include patients with either a bacterial or a human genome

tbdar_subset <- subset(tbdar, Introduction %in% c("other", "Intro 1", "Intro 5", "Intro 9", "Intro 10") | Nigerian > 0, select=c("Xray_01", "Introduction",  "Southeastern_Bantu", "Eastern_Bantu", "Western_Bantu", "other"))

explanatory = c("Introduction", "Southeastern_Bantu", "Eastern_Bantu", "Western_Bantu",  "other")
dependent = "Xray_01"
tbdar_subset %>%
    summary_factorlist(dependent = dependent, explanatory,  add_col_totals = T, add_row_totals = T, include_row_missing_col = T, p = F, na_include_dependent = TRUE) -> t1


t1
##               label    Total N Missing N    levels          0         1
##         Total N (%)                                849 (44.6) 177 (9.3)
##        Introduction 764 (74.5)       262   Intro 1   28 (4.4)   7 (5.3)
##                                           Intro 10 239 (37.8) 59 (44.7)
##                                            Intro 5   38 (6.0)   4 (3.0)
##                                            Intro 9   58 (9.2)  11 (8.3)
##                                              other 269 (42.6) 51 (38.6)
##  Southeastern_Bantu 840 (81.9)       186 Mean (SD)  0.4 (0.2) 0.5 (0.1)
##       Eastern_Bantu 840 (81.9)       186 Mean (SD)  0.2 (0.1) 0.2 (0.1)
##       Western_Bantu 840 (81.9)       186 Mean (SD)  0.1 (0.1) 0.1 (0.1)
##               other 840 (81.9)       186 Mean (SD)  0.2 (0.1) 0.2 (0.1)
##   (Missing)
##  878 (46.1)
##    36 (5.1)
##  274 (38.8)
##    44 (6.2)
##    60 (8.5)
##  293 (41.4)
##   0.4 (0.2)
##   0.2 (0.1)
##   0.1 (0.1)
##   0.2 (0.1)
# Table 1 Manuscript
library(finalfit)
library(readr)
# only include patients with either a bacterial or a human genome

tbdar_subset <- subset(tbdar, Introduction %in% c("other", "Intro 1", "Intro 5", "Intro 9", "Intro 10") | Nigerian > 0, select=c("TB_score_2cat", "Introduction",  "Southeastern_Bantu", "Eastern_Bantu", "Western_Bantu", "other"))

explanatory = c("Introduction", "Southeastern_Bantu", "Eastern_Bantu", "Western_Bantu",  "other")
dependent = "TB_score_2cat"
tbdar_subset %>%
    summary_factorlist(dependent = dependent, explanatory,  add_col_totals = T, add_row_totals = T, include_row_missing_col = T, p = F, na_include_dependent = TRUE) -> t1

t1
##               label     Total N Missing N    levels     severe        mild
##         Total N (%)                                 624 (32.8) 1280 (67.2)
##        Introduction 1471 (77.3)       433   Intro 1   22 (4.5)    49 (5.0)
##                                            Intro 10 184 (37.6)  388 (39.6)
##                                             Intro 5   26 (5.3)    60 (6.1)
##                                             Intro 9  51 (10.4)    78 (8.0)
##                                               other 207 (42.2)  406 (41.4)
##  Southeastern_Bantu 1442 (75.7)       462 Mean (SD)  0.4 (0.2)   0.4 (0.2)
##       Eastern_Bantu 1442 (75.7)       462 Mean (SD)  0.2 (0.1)   0.2 (0.1)
##       Western_Bantu 1442 (75.7)       462 Mean (SD)  0.1 (0.1)   0.1 (0.1)
##               other 1442 (75.7)       462 Mean (SD)  0.2 (0.1)   0.2 (0.1)
##  (Missing)
##    0 (0.0)
##           
##           
##           
##           
##           
##           
##           
##           
## 
# Table 1 Manuscript
library(finalfit)
library(readr)
# only include patients with either a bacterial or a human genome

tbdar_subset <- subset(tbdar, Introduction %in% c("other", "Intro 1", "Intro 5", "Intro 9", "Intro 10") | Nigerian > 0, select=c("Ct_value", "Introduction",  "Southeastern_Bantu", "Eastern_Bantu", "Western_Bantu", "other"))

explanatory = c("Introduction", "Southeastern_Bantu", "Eastern_Bantu", "Western_Bantu",  "other")
dependent = "Ct_value"
tbdar_subset %>%
    summary_factorlist(dependent = dependent, explanatory,  add_row_totals = T, include_row_missing_col = T,  p = F, na_include_dependent = TRUE) -> t1


t1
##               label    Total N Missing N    levels      unit      value
##        Introduction 863 (78.3)       239   Intro 1 Mean (sd) 20.5 (5.1)
##                                           Intro 10 Mean (sd) 19.1 (4.7)
##                                            Intro 5 Mean (sd) 19.5 (4.1)
##                                            Intro 9 Mean (sd) 19.1 (4.7)
##                                              other Mean (sd) 19.4 (4.9)
##  Southeastern_Bantu 810 (73.5)       292 [0.0,0.7] Mean (sd) 19.9 (5.2)
##       Eastern_Bantu 810 (73.5)       292 [0.1,0.4] Mean (sd) 19.9 (5.2)
##       Western_Bantu 810 (73.5)       292 [0.0,0.4] Mean (sd) 19.9 (5.2)
##               other 810 (73.5)       292 [0.1,0.9] Mean (sd) 19.9 (5.2)

Table 2

add some more covariates

library(finalfit)
library(readr)

# only include patients with either a bacterial or a human genome

tbdar_subset <- subset(tbdar, Introduction %in% c("other", "Intro 1", "Intro 5", "Intro 9", "Intro 10") | Nigerian > 0, select=c("Introduction", "patient_sex", "age", "HIV_status", "TB_RF_smoking", "Southeastern_Bantu", "Eastern_Bantu", "Western_Bantu", "other",  "symptoms_duration_cough_duration", "socioeconomic_status", "Education", "general_examination_malnutrition", "TB_category", "DR_status"))

explanatory = c("patient_sex", "age", "HIV_status", "TB_RF_smoking", "symptoms_duration_cough_duration", "socioeconomic_status","Education", "general_examination_malnutrition", "TB_category", "DR_status", "Southeastern_Bantu", "Eastern_Bantu", "Western_Bantu",  "other")
dependent = "Introduction"
tbdar_subset %>%
    summary_factorlist(dependent = dependent, explanatory,  add_col_totals = T, add_row_totals = T, include_row_missing_col = T,  p = F, na_include_dependent = TRUE) -> t1

t1
##                             label      Total N Missing N      levels
##                       Total N (%)                                   
##                       patient_sex 1471 (100.0)         0        male
##                                                               female
##                               age 1471 (100.0)         0   Mean (SD)
##                        HIV_status  1452 (98.7)        19    negative
##                                                             infected
##                     TB_RF_smoking  1462 (99.4)         9          no
##                                                                  yes
##  symptoms_duration_cough_duration  1454 (98.8)        17   Mean (SD)
##              socioeconomic_status 1471 (100.0)         0   Mean (SD)
##                         Education 1471 (100.0)         0     primary
##                                                                   no
##                                                            secondary
##                                                           university
##  general_examination_malnutrition 1471 (100.0)         0          no
##                                                                  yes
##                       TB_category  1447 (98.4)        24         new
##                                                              relapse
##                         DR_status 1471 (100.0)         0 Susceptible
##                                                             INH-Mono
##                                                                  MDR
##                Southeastern_Bantu  1009 (68.6)       462   Mean (SD)
##                     Eastern_Bantu  1009 (68.6)       462   Mean (SD)
##                     Western_Bantu  1009 (68.6)       462   Mean (SD)
##                             other  1009 (68.6)       462   Mean (SD)
##        Intro 1      Intro 10     Intro 5      Intro 9         other
##       71 (3.7)    572 (30.0)    86 (4.5)    129 (6.8)    613 (32.2)
##      52 (73.2)    425 (74.3)   55 (64.0)    87 (67.4)    424 (69.2)
##      19 (26.8)    147 (25.7)   31 (36.0)    42 (32.6)    189 (30.8)
##     32.0 (8.4)   34.6 (10.5) 33.8 (11.0)  36.0 (12.4)   34.6 (10.4)
##      60 (85.7)    462 (81.8)   67 (77.9)    95 (75.4)    515 (85.1)
##      10 (14.3)    103 (18.2)   19 (22.1)    31 (24.6)     90 (14.9)
##      52 (74.3)    411 (72.5)   74 (87.1)   103 (79.8)    482 (78.9)
##      18 (25.7)    156 (27.5)   11 (12.9)    26 (20.2)    129 (21.1)
##      3.7 (2.0)     3.8 (1.8)   3.8 (2.0)    3.7 (1.7)     3.7 (1.9)
##  154.7 (415.7) 123.5 (240.3) 99.7 (75.2) 103.0 (85.1) 113.1 (162.6)
##      54 (76.1)    390 (68.2)   61 (70.9)    92 (71.3)    391 (63.8)
##        5 (7.0)     76 (13.3)   10 (11.6)    15 (11.6)     90 (14.7)
##      12 (16.9)     90 (15.7)   13 (15.1)    18 (14.0)    105 (17.1)
##        0 (0.0)      16 (2.8)     2 (2.3)      4 (3.1)      27 (4.4)
##      62 (87.3)    501 (87.6)   75 (87.2)   116 (89.9)    530 (86.5)
##       9 (12.7)     71 (12.4)   11 (12.8)    13 (10.1)     83 (13.5)
##      67 (94.4)    539 (96.8)   83 (97.6)   125 (97.7)    597 (98.5)
##        4 (5.6)      18 (3.2)     2 (2.4)      3 (2.3)       9 (1.5)
##      70 (98.6)    556 (97.2)   80 (93.0)   119 (92.2)    574 (93.6)
##        1 (1.4)      16 (2.8)     6 (7.0)     10 (7.8)      37 (6.0)
##        0 (0.0)       0 (0.0)     0 (0.0)      0 (0.0)       2 (0.3)
##      0.4 (0.1)     0.5 (0.2)   0.5 (0.2)    0.5 (0.1)     0.4 (0.2)
##      0.2 (0.1)     0.2 (0.1)   0.2 (0.1)    0.2 (0.1)     0.2 (0.1)
##      0.1 (0.1)     0.1 (0.1)   0.1 (0.1)    0.1 (0.1)     0.1 (0.1)
##      0.2 (0.1)     0.2 (0.1)   0.2 (0.1)    0.2 (0.1)     0.3 (0.1)
##      (Missing)
##     433 (22.7)
##     302 (69.7)
##     131 (30.3)
##    35.9 (11.4)
##     330 (77.3)
##      97 (22.7)
##     334 (77.5)
##      97 (22.5)
##      3.6 (1.9)
##  111.6 (123.7)
##     273 (63.0)
##      76 (17.6)
##      64 (14.8)
##       20 (4.6)
##     377 (87.1)
##      56 (12.9)
##     416 (97.2)
##       12 (2.8)
##        0 (NaN)
##        0 (NaN)
##        0 (NaN)
##      0.4 (0.2)
##      0.2 (0.1)
##      0.1 (0.1)
##      0.3 (0.1)

create a supplementary table comparing patients to controls

library(finalfit)
library(readr)

# 

tbdar_subset <- subset(tbdar, select=c("patient_sex", "age", "HIV_status", "TB_RF_smoking", "symptoms_duration_cough_duration", "socioeconomic_status", "Education", "general_examination_malnutrition", "Type", "common_tribes"))
# the variable smoking was not recorded in the controls, 
explanatory = c("patient_sex", "age", "HIV_status", "TB_RF_smoking", "symptoms_duration_cough_duration", "socioeconomic_status","Education", "general_examination_malnutrition", "common_tribes")
dependent = "Type"
tbdar_subset %>%
    summary_factorlist(dependent = dependent, explanatory,  add_col_totals = T, add_row_totals = T, include_row_missing_col = T,  p = T) -> t1

t1
##                             label      Total N Missing N     levels
##                       Total N (%)                                  
##                       patient_sex 3101 (100.0)         0       male
##                                                              female
##                               age 3101 (100.0)         0  Mean (SD)
##                        HIV_status  3031 (97.7)        70   negative
##                                                            infected
##                     TB_RF_smoking  3090 (99.6)        11         no
##                                                                 yes
##  symptoms_duration_cough_duration  2590 (83.5)       511  Mean (SD)
##              socioeconomic_status 3101 (100.0)         0  Mean (SD)
##                         Education 3101 (100.0)         0    primary
##                                                                  no
##                                                           secondary
##                                                          university
##  general_examination_malnutrition 3101 (100.0)         0         no
##                                                                 yes
##                     common_tribes 3101 (100.0)         0      Chaga
##                                                             Makonde
##                                                             Matumbi
##                                                               Mwera
##                                                          Ndengereko
##                                                              Ngindo
##                                                               Ngoni
##                                                              Zaramo
##                                                               other
##       control       patient      p
##    962 (31.0)   2139 (69.0)       
##    610 (63.4)   1513 (70.7) <0.001
##    352 (36.6)    626 (29.3)       
##   37.1 (10.7)   34.9 (10.8) <0.001
##    753 (81.4)   1690 (80.2)  0.488
##    172 (18.6)    416 (19.8)       
##    626 (65.1)   1648 (77.4) <0.001
##    336 (34.9)    480 (22.6)       
##     3.1 (1.6)     3.7 (1.8) <0.001
##  86.8 (132.5) 114.3 (182.7) <0.001
##    648 (67.4)   1420 (66.4)  0.009
##    167 (17.4)    303 (14.2)       
##    123 (12.8)    335 (15.7)       
##      24 (2.5)      81 (3.8)       
##    934 (97.1)   1862 (87.1) <0.001
##      28 (2.9)    277 (12.9)       
##      27 (2.8)      93 (4.3)  0.003
##      58 (6.0)     170 (7.9)       
##      33 (3.4)      84 (3.9)       
##      44 (4.6)     122 (5.7)       
##    174 (18.1)    323 (15.1)       
##      34 (3.5)      70 (3.3)       
##      36 (3.7)      67 (3.1)       
##    149 (15.5)    247 (11.5)       
##    407 (42.3)    963 (45.0)
write_delim( t1,"/scicore/home/gagneux/zwymic00/Sinergia/Analysis/Human/Ancestry/Data/Outputs/Files/supplementary_table_controls.txt", delim = "\t")

adjusting for the compositional variables and categorizing where necessary

library(compositions)
## Welcome to compositions, a package for compositional data analysis.
## Find an intro with "? compositions"
## 
## Attaching package: 'compositions'
## The following objects are masked from 'package:stats':
## 
##     anova, cor, cov, dist, var
## The following object is masked from 'package:graphics':
## 
##     segments
## The following objects are masked from 'package:base':
## 
##     %*%, norm, scale, scale.default
# use the additive log ratio transformation to handle the ancestries since they are compositional variables (sum up to 1)

alr_data <- alr(tbdar[,c("Western_Bantu", "Southeastern_Bantu", "Eastern_Bantu","other")])
colnames(alr_data) <- c("Western_Bantu_alr", "Southeastern_Bantu_alr", "Eastern_Bantu_alr")
new_data <- cbind(tbdar, alr_data)

build the model for X-ray score ending up with a model including the interaction

from now on, exclude HIV-positive patients

put the ancestries into bins and check, whether there is a linear relationship

library(ggplot2)
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
for (anc in c("Western_Bantu_alr", "Southeastern_Bantu_alr", "Eastern_Bantu_alr")){
  hist(new_data[, anc], xlab = anc, main = "")
}

# make some categories

new_data$Western_alr_cat <- ifelse(new_data$Western_Bantu_alr >= -1, 4, ifelse(new_data$Western_Bantu_alr < -1 & new_data$Western_Bantu_alr > -2, 3, ifelse(new_data$Western_Bantu_alr <= -2 & new_data$Western_Bantu_alr > -6, 2,  ifelse(new_data$Western_Bantu_alr <= -6, 1, NA))))

new_data$Eastern_alr_cat <- ifelse(new_data$Eastern_Bantu_alr >= 0, 3, ifelse(new_data$Eastern_Bantu_alr < 0 & new_data$Eastern_Bantu_alr > -0.5, 2, ifelse(new_data$Eastern_Bantu_alr <= -0.5 , 1, NA)))
new_data$Southeastern_alr_cat <- ifelse(new_data$Southeastern_Bantu_alr >= 1, 3, ifelse(new_data$Southeastern_Bantu_alr < 1 & new_data$Southeastern_Bantu_alr > 0, 2, ifelse(new_data$Southeastern_Bantu_alr <= 0 , 1, NA)))


for (anc in c("Western", "Southeastern", "Eastern")){
  print(anc)
  print(table(new_data[,paste0(anc, "_alr_cat")]))
}
## [1] "Western"
## 
##   1   2   3   4 
## 104 211 488 639 
## [1] "Southeastern"
## 
##   1   2   3 
## 313 603 526 
## [1] "Eastern"
## 
##   1   2   3 
## 187 646 609
# how is the relationship between the outcome variables and the socioeconomic status?
clean <- na.omit(new_data[new_data$HIV_status == "negative", c("Xray_01", "socioeconomic_status")])
test <- glm(Xray_01 ~ socioeconomic_status, data = clean, na.action = na.exclude, family = binomial)
plot(test) # linearity looks fine to me according to https://bookdown.org/egarpor/PM-UC3M/glm-diagnostics.html Figure 5.14

clean <- na.omit(new_data[new_data$HIV_status == "negative", c("TB_score_2cat", "socioeconomic_status")])
test <- glm(TB_score_2cat ~ socioeconomic_status, data = clean, na.action = na.exclude, family = binomial)
plot(test)

clean <- na.omit(new_data[new_data$HIV_status == "negative", c("Ct_value", "socioeconomic_status")])
test <- glm(log10(Ct_value) ~ socioeconomic_status, data = clean, na.action = na.exclude, family = gaussian)
plot(test)

# make some test, to check, whether the assumptions of the model are fulfilled

# test the linearity on the log-odds scale
# do a Box-Tidwell test to see, whether the log transformed continuous variables have a significant interaction with themselves

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# X-ray score
clean <- na.omit(new_data[new_data$HIV_status == "negative", c("Western_alr_cat", "Southeastern_alr_cat", "Eastern_alr_cat", "patient_sex", "age_cat", "symptoms_duration_cough_duration", "Intro10", "TB_RF_smoking", "Xray_01", "Western_Bantu_alr", "Southeastern_Bantu_alr", "Eastern_Bantu_alr", "cough_cat", "socioeconomic_status", "TB_category", "general_examination_malnutrition", "Education", "DR_category")])
test <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socioeconomic_status + socioeconomic_status:log(socioeconomic_status) + general_examination_malnutrition + Education + TB_category + DR_category, data = clean, family = binomial, na.action = na.exclude)
logodds <- test$linear.predictors
summary(test)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socioeconomic_status + 
##     socioeconomic_status:log(socioeconomic_status) + general_examination_malnutrition + 
##     Education + TB_category + DR_category, family = binomial, 
##     data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                                 Estimate Std. Error z value
## (Intercept)                                    -1.714074   0.896771  -1.911
## as.factor(Western_alr_cat)2                    -0.303688   0.534375  -0.568
## as.factor(Western_alr_cat)3                    -0.648030   0.489604  -1.324
## as.factor(Western_alr_cat)4                    -0.606521   0.477027  -1.271
## as.factor(Eastern_alr_cat)2                     0.386321   0.489810   0.789
## as.factor(Eastern_alr_cat)3                     0.226144   0.503048   0.450
## as.factor(Southeastern_alr_cat)2                0.867088   0.392235   2.211
## as.factor(Southeastern_alr_cat)3                0.848758   0.418560   2.028
## Intro101                                        0.492607   0.265737   1.854
## as.factor(age_cat)2                            -0.045123   0.324959  -0.139
## as.factor(age_cat)3                            -0.010947   0.382693  -0.029
## as.factor(age_cat)4                            -0.030682   0.562190  -0.055
## patient_sexfemale                              -0.183113   0.313580  -0.584
## TB_RF_smokingyes                               -0.414714   0.346492  -1.197
## as.factor(cough_cat)2                          -0.662846   0.413675  -1.602
## as.factor(cough_cat)3                          -0.040582   0.296698  -0.137
## socioeconomic_status                           -0.018358   0.021685  -0.847
## general_examination_malnutritionyes             0.387367   0.368012   1.053
## Educationno                                     0.056587   0.403341   0.140
## Educationsecondary                              0.161346   0.368946   0.437
## Educationuniversity                            -0.258894   0.664273  -0.390
## TB_categoryrelapse                             -0.075975   1.128605  -0.067
## DR_categoryresistant                           -0.492706   0.788882  -0.625
## socioeconomic_status:log(socioeconomic_status)  0.003068   0.003516   0.873
##                                                Pr(>|z|)  
## (Intercept)                                      0.0560 .
## as.factor(Western_alr_cat)2                      0.5698  
## as.factor(Western_alr_cat)3                      0.1856  
## as.factor(Western_alr_cat)4                      0.2036  
## as.factor(Eastern_alr_cat)2                      0.4303  
## as.factor(Eastern_alr_cat)3                      0.6530  
## as.factor(Southeastern_alr_cat)2                 0.0271 *
## as.factor(Southeastern_alr_cat)3                 0.0426 *
## Intro101                                         0.0638 .
## as.factor(age_cat)2                              0.8896  
## as.factor(age_cat)3                              0.9772  
## as.factor(age_cat)4                              0.9565  
## patient_sexfemale                                0.5593  
## TB_RF_smokingyes                                 0.2313  
## as.factor(cough_cat)2                            0.1091  
## as.factor(cough_cat)3                            0.8912  
## socioeconomic_status                             0.3972  
## general_examination_malnutritionyes              0.2925  
## Educationno                                      0.8884  
## Educationsecondary                               0.6619  
## Educationuniversity                              0.6967  
## TB_categoryrelapse                               0.9463  
## DR_categoryresistant                             0.5323  
## socioeconomic_status:log(socioeconomic_status)   0.3829  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 400.92  on 452  degrees of freedom
## AIC: 448.92
## 
## Number of Fisher Scoring iterations: 5
# TB-score
clean <- na.omit(new_data[new_data$HIV_status == "negative", c("Western_alr_cat", "Southeastern_alr_cat", "Eastern_alr_cat", "patient_sex", "age_cat", "symptoms_duration_cough_duration", "Intro10", "TB_RF_smoking", "TB_score_2cat", "Western_Bantu_alr", "Southeastern_Bantu_alr", "Eastern_Bantu_alr", "cough_cat", "socioeconomic_status", "TB_category", "general_examination_malnutrition", "Education", "DR_category")])
test <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socioeconomic_status + socioeconomic_status:log(socioeconomic_status) + general_examination_malnutrition + Education + TB_category + DR_category, data = clean, family = binomial, na.action = na.exclude)
logodds <- test$linear.predictors
summary(test)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socioeconomic_status + 
##     socioeconomic_status:log(socioeconomic_status) + general_examination_malnutrition + 
##     Education + TB_category + DR_category, family = binomial, 
##     data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                                  Estimate Std. Error z value
## (Intercept)                                     0.9064289  0.4756788   1.906
## as.factor(Western_alr_cat)2                     0.0069756  0.3627947   0.019
## as.factor(Western_alr_cat)3                     0.0988675  0.3235889   0.306
## as.factor(Western_alr_cat)4                    -0.0457328  0.3210485  -0.142
## as.factor(Eastern_alr_cat)2                     0.1543690  0.2614055   0.591
## as.factor(Eastern_alr_cat)3                     0.0582153  0.2699886   0.216
## as.factor(Southeastern_alr_cat)2               -0.4788609  0.2183675  -2.193
## as.factor(Southeastern_alr_cat)3                0.0324061  0.2391166   0.136
## Intro101                                        0.0247476  0.1687611   0.147
## as.factor(age_cat)2                             0.2807053  0.1976608   1.420
## as.factor(age_cat)3                            -0.0262999  0.2394222  -0.110
## as.factor(age_cat)4                             0.3161308  0.3435733   0.920
## patient_sexfemale                              -0.2367093  0.1944373  -1.217
## TB_RF_smokingyes                               -0.1470433  0.2084468  -0.705
## as.factor(cough_cat)2                          -0.1339090  0.2477538  -0.540
## as.factor(cough_cat)3                          -0.2093950  0.1976269  -1.060
## socioeconomic_status                            0.0091886  0.0040823   2.251
## general_examination_malnutritionyes            -1.8113481  0.2512825  -7.208
## Educationno                                    -0.1041520  0.2527282  -0.412
## Educationsecondary                             -0.3196979  0.2227096  -1.435
## Educationuniversity                             0.1134083  0.4177544   0.271
## TB_categoryrelapse                             -0.6033336  0.6543214  -0.922
## DR_categoryresistant                            0.4837574  0.4373979   1.106
## socioeconomic_status:log(socioeconomic_status) -0.0011201  0.0005015  -2.234
##                                                Pr(>|z|)    
## (Intercept)                                      0.0567 .  
## as.factor(Western_alr_cat)2                      0.9847    
## as.factor(Western_alr_cat)3                      0.7600    
## as.factor(Western_alr_cat)4                      0.8867    
## as.factor(Eastern_alr_cat)2                      0.5548    
## as.factor(Eastern_alr_cat)3                      0.8293    
## as.factor(Southeastern_alr_cat)2                 0.0283 *  
## as.factor(Southeastern_alr_cat)3                 0.8922    
## Intro101                                         0.8834    
## as.factor(age_cat)2                              0.1556    
## as.factor(age_cat)3                              0.9125    
## as.factor(age_cat)4                              0.3575    
## patient_sexfemale                                0.2234    
## TB_RF_smokingyes                                 0.4805    
## as.factor(cough_cat)2                            0.5889    
## as.factor(cough_cat)3                            0.2894    
## socioeconomic_status                             0.0244 *  
## general_examination_malnutritionyes            5.66e-13 ***
## Educationno                                      0.6803    
## Educationsecondary                               0.1511    
## Educationuniversity                              0.7860    
## TB_categoryrelapse                               0.3565    
## DR_categoryresistant                             0.2687    
## socioeconomic_status:log(socioeconomic_status)   0.0255 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.59  on 826  degrees of freedom
## Residual deviance:  936.16  on 803  degrees of freedom
## AIC: 984.16
## 
## Number of Fisher Scoring iterations: 4
# Ct-value - look at the residual plots
clean <- na.omit(new_data[new_data$HIV_status == "negative", c("Western_alr_cat", "Southeastern_alr_cat", "Eastern_alr_cat", "patient_sex", "age_cat", "symptoms_duration_cough_duration", "Intro10", "TB_RF_smoking", "Ct_value", "Western_Bantu_alr", "Southeastern_Bantu_alr", "Eastern_Bantu_alr", "cough_cat", "socioeconomic_status", "TB_category", "general_examination_malnutrition", "Education", "DR_category")])
test <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socioeconomic_status + general_examination_malnutrition + Education + TB_category + DR_category, data = clean, family = "gaussian", na.action = na.exclude)
plot(test)

–> for TB-score, there is linearity with the socioeconomic status. Thus let us categorize the variable –> added to the beginning already

any multicollinearity between the explanatory variables?

–> seems fine, all VIF values are low…

library(car)

clean <- na.omit(new_data[new_data$HIV_status == "negative", c("Western_alr_cat", "Southeastern_alr_cat", "Eastern_alr_cat", "patient_sex", "age_cat", "symptoms_duration_cough_duration", "Intro10", "TB_RF_smoking", "Xray_01", "Western_Bantu_alr", "Southeastern_Bantu_alr", "Eastern_Bantu_alr", "cough_cat", "socio_cat", "TB_category", "general_examination_malnutrition", "Education", "DR_category")])
test <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition + Education + TB_category + DR_category, data = clean, family = binomial, na.action = na.exclude)
vif(test)
##                                      GVIF Df GVIF^(1/(2*Df))
## as.factor(Western_alr_cat)       1.263581  3        1.039762
## as.factor(Eastern_alr_cat)       1.219243  2        1.050806
## as.factor(Southeastern_alr_cat)  1.287133  2        1.065138
## Intro10                          1.071373  1        1.035071
## as.factor(age_cat)               1.400526  3        1.057747
## patient_sex                      1.224722  1        1.106672
## TB_RF_smoking                    1.252161  1        1.119000
## as.factor(cough_cat)             1.152978  2        1.036228
## socio_cat                        1.270735  2        1.061729
## general_examination_malnutrition 1.149464  1        1.072131
## Education                        1.446129  3        1.063411
## TB_category                      1.036860  1        1.018263
## DR_category                      1.061153  1        1.030123
clean <- na.omit(new_data[new_data$HIV_status == "negative", c("Western_alr_cat", "Southeastern_alr_cat", "Eastern_alr_cat", "patient_sex", "age_cat", "symptoms_duration_cough_duration", "Intro10", "TB_RF_smoking", "TB_score_2cat", "Western_Bantu_alr", "Southeastern_Bantu_alr", "Eastern_Bantu_alr", "cough_cat", "socio_cat", "TB_category", "general_examination_malnutrition", "Education", "DR_category")])
test <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition + Education + TB_category+ DR_category, data = clean, family = binomial, na.action = na.exclude)
vif(test)
##                                      GVIF Df GVIF^(1/(2*Df))
## as.factor(Western_alr_cat)       1.197967  3        1.030562
## as.factor(Eastern_alr_cat)       1.219483  2        1.050858
## as.factor(Southeastern_alr_cat)  1.270586  2        1.061698
## Intro10                          1.040253  1        1.019928
## as.factor(age_cat)               1.288336  3        1.043129
## patient_sex                      1.229874  1        1.108997
## TB_RF_smoking                    1.310898  1        1.144945
## as.factor(cough_cat)             1.054779  2        1.013422
## socio_cat                        1.180576  2        1.042374
## general_examination_malnutrition 1.080151  1        1.039303
## Education                        1.317486  3        1.047026
## TB_category                      1.043649  1        1.021591
## DR_category                      1.024508  1        1.012180
clean <- na.omit(new_data[new_data$HIV_status == "negative", c("Western_alr_cat", "Southeastern_alr_cat", "Eastern_alr_cat", "patient_sex", "age_cat", "symptoms_duration_cough_duration", "Intro10", "TB_RF_smoking", "TB_score_2cat", "Western_Bantu_alr", "Southeastern_Bantu_alr", "Eastern_Bantu_alr", "cough_cat", "socio_cat", "TB_category", "general_examination_malnutrition", "Education", "DR_category")])
test <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition + Education + TB_category+ DR_category, data = clean, family = binomial, na.action = na.exclude)
vif(test)
##                                      GVIF Df GVIF^(1/(2*Df))
## as.factor(Western_alr_cat)       1.197967  3        1.030562
## as.factor(Eastern_alr_cat)       1.219483  2        1.050858
## as.factor(Southeastern_alr_cat)  1.270586  2        1.061698
## Intro10                          1.040253  1        1.019928
## as.factor(age_cat)               1.288336  3        1.043129
## patient_sex                      1.229874  1        1.108997
## TB_RF_smoking                    1.310898  1        1.144945
## as.factor(cough_cat)             1.054779  2        1.013422
## socio_cat                        1.180576  2        1.042374
## general_examination_malnutrition 1.080151  1        1.039303
## Education                        1.317486  3        1.047026
## TB_category                      1.043649  1        1.021591
## DR_category                      1.024508  1        1.012180

create a model and look at the relationship

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
clean <- na.omit(new_data[new_data$HIV_status == "negative", c("Western_alr_cat", "Southeastern_alr_cat", "Eastern_alr_cat", "patient_sex", "age_cat", "symptoms_duration_cough_duration", "Intro10", "TB_RF_smoking", "Xray_01", "Western_Bantu_alr", "Southeastern_Bantu_alr", "Eastern_Bantu_alr", "cough_cat", "socio_cat", "general_examination_malnutrition", "TB_category", "Education", "DR_category", "DR_status")])

dim(clean)
## [1] 476  19
model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Southeastern_alr_cat) + as.factor(Eastern_alr_cat), data = clean, family = binomial, na.action = na.exclude)
print(summary(model1))
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     as.factor(Eastern_alr_cat), family = binomial, data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -2.1597     0.6315  -3.420 0.000627 ***
## as.factor(Western_alr_cat)2       -0.3371     0.5168  -0.652 0.514234    
## as.factor(Western_alr_cat)3       -0.6551     0.4774  -1.372 0.169960    
## as.factor(Western_alr_cat)4       -0.6004     0.4584  -1.310 0.190308    
## as.factor(Southeastern_alr_cat)2   0.8398     0.3842   2.186 0.028815 *  
## as.factor(Southeastern_alr_cat)3   0.8083     0.3990   2.026 0.042768 *  
## as.factor(Eastern_alr_cat)2        0.4603     0.4780   0.963 0.335524    
## as.factor(Eastern_alr_cat)3        0.3687     0.4877   0.756 0.449669    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 410.73  on 468  degrees of freedom
## AIC: 426.73
## 
## Number of Fisher Scoring iterations: 5
ORs <- exp(cbind(OR = coef(model1), confint(model1)))
## Waiting for profiling to be done...
round(ORs, digits = 2)
##                                    OR 2.5 % 97.5 %
## (Intercept)                      0.12  0.03   0.37
## as.factor(Western_alr_cat)2      0.71  0.26   2.03
## as.factor(Western_alr_cat)3      0.52  0.21   1.38
## as.factor(Western_alr_cat)4      0.55  0.23   1.42
## as.factor(Southeastern_alr_cat)2 2.32  1.13   5.16
## as.factor(Southeastern_alr_cat)3 2.24  1.06   5.13
## as.factor(Eastern_alr_cat)2      1.58  0.66   4.43
## as.factor(Eastern_alr_cat)3      1.45  0.59   4.11
model2 <- glm(Xray_01 ~ 1, data = clean, family = binomial, na.action = na.exclude)
lrtest(model1, model2)
## Likelihood ratio test
## 
## Model 1: Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     as.factor(Eastern_alr_cat)
## Model 2: Xray_01 ~ 1
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   8 -205.36                     
## 2   1 -210.67 -7 10.615     0.1563

–> looking at the summary I would say that Western_Bantu and Southeastern_Bantu are not linear, thus use them as categories

# add the introduction
model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10, family = binomial, 
##     data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -2.3495     0.6471  -3.631 0.000283 ***
## as.factor(Western_alr_cat)2       -0.3274     0.5192  -0.631 0.528358    
## as.factor(Western_alr_cat)3       -0.6715     0.4799  -1.399 0.161704    
## as.factor(Western_alr_cat)4       -0.5811     0.4601  -1.263 0.206600    
## as.factor(Eastern_alr_cat)2        0.4926     0.4800   1.026 0.304788    
## as.factor(Eastern_alr_cat)3        0.3711     0.4889   0.759 0.447828    
## as.factor(Southeastern_alr_cat)2   0.8400     0.3843   2.186 0.028841 *  
## as.factor(Southeastern_alr_cat)3   0.8137     0.3995   2.037 0.041658 *  
## Intro101                           0.4231     0.2569   1.647 0.099508 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 408.05  on 467  degrees of freedom
## AIC: 426.05
## 
## Number of Fisher Scoring iterations: 5
# add the covariates
# + age + patient_sex + TB_RF_smoking + HIV_status + symptoms_duration_cough_duration + socioeconomic_status

# do it one by one

model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat), data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat), 
##     family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -2.25815    0.66576  -3.392 0.000694 ***
## as.factor(Western_alr_cat)2      -0.31745    0.52249  -0.608 0.543467    
## as.factor(Western_alr_cat)3      -0.66766    0.48129  -1.387 0.165369    
## as.factor(Western_alr_cat)4      -0.56235    0.46361  -1.213 0.225141    
## as.factor(Eastern_alr_cat)2       0.48207    0.48127   1.002 0.316504    
## as.factor(Eastern_alr_cat)3       0.34687    0.49170   0.705 0.480524    
## as.factor(Southeastern_alr_cat)2  0.84954    0.38488   2.207 0.027292 *  
## as.factor(Southeastern_alr_cat)3  0.80930    0.40150   2.016 0.043832 *  
## Intro101                          0.41722    0.25754   1.620 0.105221    
## as.factor(age_cat)2              -0.19231    0.29821  -0.645 0.519003    
## as.factor(age_cat)3              -0.08697    0.35721  -0.243 0.807645    
## as.factor(age_cat)4              -0.11508    0.53765  -0.214 0.830513    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 407.62  on 464  degrees of freedom
## AIC: 431.62
## 
## Number of Fisher Scoring iterations: 5
# add the covariates
# + age + patient_sex + TB_RF_smoking + HIV_status + symptoms_duration_cough_duration

# do it one by one
# sex

model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex, family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                      -2.23596    0.67979  -3.289   0.0010 **
## as.factor(Western_alr_cat)2      -0.32661    0.52572  -0.621   0.5344   
## as.factor(Western_alr_cat)3      -0.67211    0.48221  -1.394   0.1634   
## as.factor(Western_alr_cat)4      -0.56846    0.46529  -1.222   0.2218   
## as.factor(Eastern_alr_cat)2       0.48185    0.48130   1.001   0.3168   
## as.factor(Eastern_alr_cat)3       0.34952    0.49197   0.710   0.4774   
## as.factor(Southeastern_alr_cat)2  0.84914    0.38479   2.207   0.0273 * 
## as.factor(Southeastern_alr_cat)3  0.80816    0.40145   2.013   0.0441 * 
## Intro101                          0.41424    0.25820   1.604   0.1086   
## as.factor(age_cat)2              -0.19525    0.29878  -0.653   0.5134   
## as.factor(age_cat)3              -0.09040    0.35780  -0.253   0.8005   
## as.factor(age_cat)4              -0.12676    0.54237  -0.234   0.8152   
## patient_sexfemale                -0.04622    0.28508  -0.162   0.8712   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 407.60  on 463  degrees of freedom
## AIC: 433.6
## 
## Number of Fisher Scoring iterations: 5
# add the covariates
# + age + patient_sex + TB_RF_smoking + symptoms_duration_cough_duration

# do it one by one
# smoking

model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking, family = binomial, data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                      -2.15884    0.68463  -3.153  0.00161 **
## as.factor(Western_alr_cat)2      -0.34258    0.52728  -0.650  0.51588   
## as.factor(Western_alr_cat)3      -0.66261    0.48295  -1.372  0.17007   
## as.factor(Western_alr_cat)4      -0.58627    0.46609  -1.258  0.20844   
## as.factor(Eastern_alr_cat)2       0.48106    0.48143   0.999  0.31767   
## as.factor(Eastern_alr_cat)3       0.35184    0.49195   0.715  0.47449   
## as.factor(Southeastern_alr_cat)2  0.83923    0.38548   2.177  0.02947 * 
## as.factor(Southeastern_alr_cat)3  0.80555    0.40193   2.004  0.04505 * 
## Intro101                          0.42846    0.25881   1.655  0.09783 . 
## as.factor(age_cat)2              -0.15738    0.30167  -0.522  0.60188   
## as.factor(age_cat)3              -0.02029    0.36562  -0.055  0.95575   
## as.factor(age_cat)4              -0.08633    0.54464  -0.159  0.87405   
## patient_sexfemale                -0.14149    0.30145  -0.469  0.63880   
## TB_RF_smokingyes                 -0.30645    0.33576  -0.913  0.36139   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 406.74  on 462  degrees of freedom
## AIC: 434.74
## 
## Number of Fisher Scoring iterations: 5
# add the covariates
# + age + patient_sex + TB_RF_smoking + symptoms_duration_cough_duration

# do it one by one
# cough duration

model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat), data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat), family = binomial, 
##     data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                      -1.98630    0.72744  -2.731  0.00632 **
## as.factor(Western_alr_cat)2      -0.35513    0.53145  -0.668  0.50399   
## as.factor(Western_alr_cat)3      -0.66795    0.48668  -1.372  0.16992   
## as.factor(Western_alr_cat)4      -0.61888    0.47083  -1.314  0.18869   
## as.factor(Eastern_alr_cat)2       0.44071    0.48221   0.914  0.36074   
## as.factor(Eastern_alr_cat)3       0.30616    0.49395   0.620  0.53538   
## as.factor(Southeastern_alr_cat)2  0.82196    0.38744   2.122  0.03388 * 
## as.factor(Southeastern_alr_cat)3  0.81739    0.40339   2.026  0.04274 * 
## Intro101                          0.46435    0.26236   1.770  0.07675 . 
## as.factor(age_cat)2              -0.12753    0.30354  -0.420  0.67438   
## as.factor(age_cat)3              -0.04063    0.36800  -0.110  0.91208   
## as.factor(age_cat)4              -0.02895    0.54869  -0.053  0.95792   
## patient_sexfemale                -0.17076    0.30402  -0.562  0.57434   
## TB_RF_smokingyes                 -0.35708    0.33893  -1.054  0.29210   
## as.factor(cough_cat)2            -0.62899    0.40754  -1.543  0.12274   
## as.factor(cough_cat)3            -0.02462    0.28354  -0.087  0.93080   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 403.69  on 460  degrees of freedom
## AIC: 435.69
## 
## Number of Fisher Scoring iterations: 5
# add the covariates
# + age + patient_sex + TB_RF_smoking + symptoms_duration_cough_duration + socioeconomic_status

# do it one by one
# socioeconomic_status

model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat), data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat), 
##     family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                      -1.8953739  0.7500866  -2.527   0.0115 *
## as.factor(Western_alr_cat)2      -0.3798857  0.5332104  -0.712   0.4762  
## as.factor(Western_alr_cat)3      -0.6625474  0.4874575  -1.359   0.1741  
## as.factor(Western_alr_cat)4      -0.6029257  0.4719330  -1.278   0.2014  
## as.factor(Eastern_alr_cat)2       0.4266041  0.4826585   0.884   0.3768  
## as.factor(Eastern_alr_cat)3       0.2887815  0.4939081   0.585   0.5588  
## as.factor(Southeastern_alr_cat)2  0.8212903  0.3905007   2.103   0.0355 *
## as.factor(Southeastern_alr_cat)3  0.7903390  0.4101245   1.927   0.0540 .
## Intro101                          0.4759988  0.2631291   1.809   0.0705 .
## as.factor(age_cat)2              -0.1457081  0.3046314  -0.478   0.6324  
## as.factor(age_cat)3              -0.0545067  0.3686680  -0.148   0.8825  
## as.factor(age_cat)4              -0.0655988  0.5519047  -0.119   0.9054  
## patient_sexfemale                -0.1905175  0.3091752  -0.616   0.5378  
## TB_RF_smokingyes                 -0.3585102  0.3398542  -1.055   0.2915  
## as.factor(cough_cat)2            -0.6278634  0.4082305  -1.538   0.1240  
## as.factor(cough_cat)3            -0.0113128  0.2858056  -0.040   0.9684  
## as.factor(socio_cat)2            -0.2295867  0.3040497  -0.755   0.4502  
## as.factor(socio_cat)3            -0.0002465  0.4251929  -0.001   0.9995  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 403.07  on 458  degrees of freedom
## AIC: 439.07
## 
## Number of Fisher Scoring iterations: 5
# add the covariates
# + age + patient_sex + TB_RF_smoking + symptoms_duration_cough_duration + socioeconomic_status

# do it one by one
# socioeconomic_status

model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat) + general_examination_malnutrition, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat) + 
##     general_examination_malnutrition, family = binomial, data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                         -1.922419   0.750680  -2.561   0.0104 *
## as.factor(Western_alr_cat)2         -0.368650   0.533227  -0.691   0.4893  
## as.factor(Western_alr_cat)3         -0.632031   0.487479  -1.297   0.1948  
## as.factor(Western_alr_cat)4         -0.574286   0.471553  -1.218   0.2233  
## as.factor(Eastern_alr_cat)2          0.412149   0.483791   0.852   0.3943  
## as.factor(Eastern_alr_cat)3          0.260222   0.495861   0.525   0.5997  
## as.factor(Southeastern_alr_cat)2     0.825505   0.390612   2.113   0.0346 *
## as.factor(Southeastern_alr_cat)3     0.798272   0.410874   1.943   0.0520 .
## Intro101                             0.506851   0.265083   1.912   0.0559 .
## as.factor(age_cat)2                 -0.123658   0.304764  -0.406   0.6849  
## as.factor(age_cat)3                 -0.066283   0.369894  -0.179   0.8578  
## as.factor(age_cat)4                 -0.079430   0.551961  -0.144   0.8856  
## patient_sexfemale                   -0.189293   0.309471  -0.612   0.5408  
## TB_RF_smokingyes                    -0.412019   0.343685  -1.199   0.2306  
## as.factor(cough_cat)2               -0.658729   0.409745  -1.608   0.1079  
## as.factor(cough_cat)3               -0.077489   0.292086  -0.265   0.7908  
## as.factor(socio_cat)2               -0.273726   0.307454  -0.890   0.3733  
## as.factor(socio_cat)3               -0.007342   0.426167  -0.017   0.9863  
## general_examination_malnutritionyes  0.424484   0.365998   1.160   0.2461  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 401.78  on 457  degrees of freedom
## AIC: 439.78
## 
## Number of Fisher Scoring iterations: 5
# add the covariates
# + age + patient_sex + TB_RF_smoking + symptoms_duration_cough_duration + socioeconomic_status

# do it one by one
# socioeconomic_status

model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat) + general_examination_malnutrition + TB_category, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat) + 
##     general_examination_malnutrition + TB_category, family = binomial, 
##     data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                         -1.923386   0.751386  -2.560   0.0105 *
## as.factor(Western_alr_cat)2         -0.368560   0.533237  -0.691   0.4895  
## as.factor(Western_alr_cat)3         -0.631399   0.487935  -1.294   0.1957  
## as.factor(Western_alr_cat)4         -0.573614   0.472085  -1.215   0.2243  
## as.factor(Eastern_alr_cat)2          0.412768   0.484224   0.852   0.3940  
## as.factor(Eastern_alr_cat)3          0.260364   0.495877   0.525   0.5995  
## as.factor(Southeastern_alr_cat)2     0.825633   0.390628   2.114   0.0345 *
## as.factor(Southeastern_alr_cat)3     0.799103   0.411790   1.941   0.0523 .
## Intro101                             0.506798   0.265090   1.912   0.0559 .
## as.factor(age_cat)2                 -0.123199   0.305144  -0.404   0.6864  
## as.factor(age_cat)3                 -0.065940   0.370063  -0.178   0.8586  
## as.factor(age_cat)4                 -0.078477   0.552888  -0.142   0.8871  
## patient_sexfemale                   -0.189591   0.309626  -0.612   0.5403  
## TB_RF_smokingyes                    -0.411991   0.343699  -1.199   0.2306  
## as.factor(cough_cat)2               -0.659180   0.410019  -1.608   0.1079  
## as.factor(cough_cat)3               -0.077258   0.292192  -0.264   0.7915  
## as.factor(socio_cat)2               -0.273270   0.307818  -0.888   0.3747  
## as.factor(socio_cat)3               -0.007818   0.426460  -0.018   0.9854  
## general_examination_malnutritionyes  0.423805   0.366698   1.156   0.2478  
## TB_categoryrelapse                  -0.033689   1.131928  -0.030   0.9763  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 401.78  on 456  degrees of freedom
## AIC: 441.78
## 
## Number of Fisher Scoring iterations: 5
# add the covariates
# + age + patient_sex + TB_RF_smoking + symptoms_duration_cough_duration + socioeconomic_status

# do it one by one
# socioeconomic_status

model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat)+ general_examination_malnutrition + TB_category + Education, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat) + 
##     general_examination_malnutrition + TB_category + Education, 
##     family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                         -1.98536    0.75847  -2.618  0.00886 **
## as.factor(Western_alr_cat)2         -0.35939    0.53481  -0.672  0.50159   
## as.factor(Western_alr_cat)3         -0.63664    0.48959  -1.300  0.19348   
## as.factor(Western_alr_cat)4         -0.59621    0.47692  -1.250  0.21125   
## as.factor(Eastern_alr_cat)2          0.40837    0.48580   0.841  0.40056   
## as.factor(Eastern_alr_cat)3          0.25373    0.49876   0.509  0.61095   
## as.factor(Southeastern_alr_cat)2     0.83689    0.39166   2.137  0.03262 * 
## as.factor(Southeastern_alr_cat)3     0.82460    0.41900   1.968  0.04907 * 
## Intro101                             0.51887    0.26629   1.949  0.05135 . 
## as.factor(age_cat)2                 -0.06894    0.32435  -0.213  0.83169   
## as.factor(age_cat)3                 -0.02067    0.38218  -0.054  0.95687   
## as.factor(age_cat)4                 -0.03925    0.55884  -0.070  0.94400   
## patient_sexfemale                   -0.19277    0.31010  -0.622  0.53418   
## TB_RF_smokingyes                    -0.41777    0.34585  -1.208  0.22707   
## as.factor(cough_cat)2               -0.66925    0.41139  -1.627  0.10378   
## as.factor(cough_cat)3               -0.07732    0.29313  -0.264  0.79197   
## as.factor(socio_cat)2               -0.27269    0.31126  -0.876  0.38098   
## as.factor(socio_cat)3                0.01433    0.43520   0.033  0.97372   
## general_examination_malnutritionyes  0.40919    0.37130   1.102  0.27044   
## TB_categoryrelapse                  -0.03779    1.13271  -0.033  0.97339   
## Educationno                          0.03560    0.40310   0.088  0.92963   
## Educationsecondary                   0.19448    0.36959   0.526  0.59875   
## Educationuniversity                 -0.20149    0.66662  -0.302  0.76245   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 401.35  on 453  degrees of freedom
## AIC: 447.35
## 
## Number of Fisher Scoring iterations: 5
# add the covariates
# + age + patient_sex + TB_RF_smoking + symptoms_duration_cough_duration + socioeconomic_status

# do it one by one
# socioeconomic_status

model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat)+ general_examination_malnutrition + TB_category + Education + DR_status, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat) + 
##     general_examination_malnutrition + TB_category + Education + 
##     DR_status, family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                          -1.97543    0.75635  -2.612  0.00901 **
## as.factor(Western_alr_cat)2          -0.32864    0.53605  -0.613  0.53982   
## as.factor(Western_alr_cat)3          -0.63461    0.48992  -1.295  0.19521   
## as.factor(Western_alr_cat)4          -0.59465    0.47726  -1.246  0.21277   
## as.factor(Eastern_alr_cat)2           0.37163    0.49035   0.758  0.44851   
## as.factor(Eastern_alr_cat)3           0.22392    0.50438   0.444  0.65707   
## as.factor(Southeastern_alr_cat)2      0.85958    0.39355   2.184  0.02895 * 
## as.factor(Southeastern_alr_cat)3      0.83501    0.42045   1.986  0.04703 * 
## Intro101                              0.50999    0.26642   1.914  0.05559 . 
## as.factor(age_cat)2                  -0.04507    0.32519  -0.139  0.88976   
## as.factor(age_cat)3                  -0.01501    0.38267  -0.039  0.96872   
## as.factor(age_cat)4                  -0.02067    0.55982  -0.037  0.97055   
## patient_sexfemale                    -0.17206    0.31086  -0.553  0.57992   
## TB_RF_smokingyes                     -0.42329    0.34631  -1.222  0.22159   
## as.factor(cough_cat)2                -0.66871    0.41171  -1.624  0.10433   
## as.factor(cough_cat)3                -0.07704    0.29353  -0.262  0.79296   
## as.factor(socio_cat)2                -0.26092    0.31222  -0.836  0.40332   
## as.factor(socio_cat)3                 0.02679    0.43567   0.061  0.95098   
## general_examination_malnutritionyes   0.41065    0.37119   1.106  0.26860   
## TB_categoryrelapse                   -0.04574    1.13199  -0.040  0.96777   
## Educationno                           0.04790    0.40472   0.118  0.90580   
## Educationsecondary                    0.18450    0.36956   0.499  0.61760   
## Educationuniversity                  -0.20602    0.66670  -0.309  0.75731   
## DR_statusINH-Mono                    -0.40393    0.79376  -0.509  0.61083   
## DR_statusMDR                        -13.12837  882.74354  -0.015  0.98813   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 400.66  on 451  degrees of freedom
## AIC: 450.66
## 
## Number of Fisher Scoring iterations: 13

–> there are too little MDR cases, categorize the DR-status into susceptible and resistant –> adjusted in the beginning…

# add the covariates


model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat)+ general_examination_malnutrition + TB_category + Education + DR_category, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + as.factor(socio_cat) + 
##     general_examination_malnutrition + TB_category + Education + 
##     DR_category, family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                         -1.97107    0.75633  -2.606  0.00916 **
## as.factor(Western_alr_cat)2         -0.33944    0.53612  -0.633  0.52664   
## as.factor(Western_alr_cat)3         -0.63303    0.49004  -1.292  0.19643   
## as.factor(Western_alr_cat)4         -0.59279    0.47735  -1.242  0.21429   
## as.factor(Eastern_alr_cat)2          0.36694    0.49020   0.749  0.45412   
## as.factor(Eastern_alr_cat)3          0.21082    0.50330   0.419  0.67530   
## as.factor(Southeastern_alr_cat)2     0.85868    0.39374   2.181  0.02920 * 
## as.factor(Southeastern_alr_cat)3     0.83781    0.42054   1.992  0.04634 * 
## Intro101                             0.51191    0.26650   1.921  0.05475 . 
## as.factor(age_cat)2                 -0.04971    0.32522  -0.153  0.87852   
## as.factor(age_cat)3                 -0.01532    0.38278  -0.040  0.96807   
## as.factor(age_cat)4                 -0.02374    0.55989  -0.042  0.96618   
## patient_sexfemale                   -0.17864    0.31082  -0.575  0.56547   
## TB_RF_smokingyes                    -0.42442    0.34646  -1.225  0.22057   
## as.factor(cough_cat)2               -0.66239    0.41163  -1.609  0.10757   
## as.factor(cough_cat)3               -0.07016    0.29332  -0.239  0.81097   
## as.factor(socio_cat)2               -0.25823    0.31230  -0.827  0.40831   
## as.factor(socio_cat)3                0.03100    0.43558   0.071  0.94327   
## general_examination_malnutritionyes  0.41154    0.37131   1.108  0.26771   
## TB_categoryrelapse                  -0.04890    1.13224  -0.043  0.96555   
## Educationno                          0.05384    0.40468   0.133  0.89416   
## Educationsecondary                   0.18487    0.36949   0.500  0.61684   
## Educationuniversity                 -0.20094    0.66668  -0.301  0.76311   
## DR_categoryresistant                -0.47988    0.78984  -0.608  0.54348   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 400.94  on 452  degrees of freedom
## AIC: 448.94
## 
## Number of Fisher Scoring iterations: 5

Compare to a model without the ancestries

having more covariables

Table 3_revisions_morecovariates

library(lmtest)
model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + TB_category + general_examination_malnutrition + Education+ DR_category, data = clean, family = binomial, na.action = na.exclude)
model2 <- glm(Xray_01 ~ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat+ TB_category + general_examination_malnutrition + Education+ DR_category, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     TB_category + general_examination_malnutrition + Education + 
##     DR_category, family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                         -1.97107    0.75633  -2.606  0.00916 **
## as.factor(Western_alr_cat)2         -0.33944    0.53612  -0.633  0.52664   
## as.factor(Western_alr_cat)3         -0.63303    0.49004  -1.292  0.19643   
## as.factor(Western_alr_cat)4         -0.59279    0.47735  -1.242  0.21429   
## as.factor(Eastern_alr_cat)2          0.36694    0.49020   0.749  0.45412   
## as.factor(Eastern_alr_cat)3          0.21082    0.50330   0.419  0.67530   
## as.factor(Southeastern_alr_cat)2     0.85868    0.39374   2.181  0.02920 * 
## as.factor(Southeastern_alr_cat)3     0.83781    0.42054   1.992  0.04634 * 
## Intro101                             0.51191    0.26650   1.921  0.05475 . 
## as.factor(age_cat)2                 -0.04971    0.32522  -0.153  0.87852   
## as.factor(age_cat)3                 -0.01532    0.38278  -0.040  0.96807   
## as.factor(age_cat)4                 -0.02374    0.55989  -0.042  0.96618   
## patient_sexfemale                   -0.17864    0.31082  -0.575  0.56547   
## TB_RF_smokingyes                    -0.42442    0.34646  -1.225  0.22057   
## as.factor(cough_cat)2               -0.66239    0.41163  -1.609  0.10757   
## as.factor(cough_cat)3               -0.07016    0.29332  -0.239  0.81097   
## socio_cat2                          -0.25823    0.31230  -0.827  0.40831   
## socio_cat3                           0.03100    0.43558   0.071  0.94327   
## TB_categoryrelapse                  -0.04890    1.13224  -0.043  0.96555   
## general_examination_malnutritionyes  0.41154    0.37131   1.108  0.26771   
## Educationno                          0.05384    0.40468   0.133  0.89416   
## Educationsecondary                   0.18487    0.36949   0.500  0.61684   
## Educationuniversity                 -0.20094    0.66668  -0.301  0.76311   
## DR_categoryresistant                -0.47988    0.78984  -0.608  0.54348   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 400.94  on 452  degrees of freedom
## AIC: 448.94
## 
## Number of Fisher Scoring iterations: 5
lrtest(model1, model2)
## Likelihood ratio test
## 
## Model 1: Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     TB_category + general_examination_malnutrition + Education + 
##     DR_category
## Model 2: Xray_01 ~ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + 
##     as.factor(cough_cat) + socio_cat + TB_category + general_examination_malnutrition + 
##     Education + DR_category
##   #Df  LogLik Df Chisq Pr(>Chisq)
## 1  24 -200.47                    
## 2  17 -205.34 -7 9.739     0.2039
ORs <- exp(cbind(OR = coef(model1), confint(model1)))
## Waiting for profiling to be done...
round(ORs, digits = 2)
##                                       OR 2.5 % 97.5 %
## (Intercept)                         0.14  0.03   0.58
## as.factor(Western_alr_cat)2         0.71  0.25   2.10
## as.factor(Western_alr_cat)3         0.53  0.21   1.45
## as.factor(Western_alr_cat)4         0.55  0.22   1.48
## as.factor(Eastern_alr_cat)2         1.44  0.58   4.11
## as.factor(Eastern_alr_cat)3         1.23  0.49   3.60
## as.factor(Southeastern_alr_cat)2    2.36  1.13   5.35
## as.factor(Southeastern_alr_cat)3    2.31  1.04   5.49
## Intro101                            1.67  0.99   2.81
## as.factor(age_cat)2                 0.95  0.50   1.80
## as.factor(age_cat)3                 0.98  0.46   2.06
## as.factor(age_cat)4                 0.98  0.30   2.76
## patient_sexfemale                   0.84  0.45   1.53
## TB_RF_smokingyes                    0.65  0.32   1.27
## as.factor(cough_cat)2               0.52  0.22   1.12
## as.factor(cough_cat)3               0.93  0.53   1.67
## socio_cat2                          0.77  0.41   1.41
## socio_cat3                          1.03  0.42   2.35
## TB_categoryrelapse                  0.95  0.05   6.48
## general_examination_malnutritionyes 1.51  0.71   3.07
## Educationno                         1.06  0.46   2.26
## Educationsecondary                  1.20  0.57   2.45
## Educationuniversity                 0.82  0.18   2.68
## DR_categoryresistant                0.62  0.09   2.40

add the interaction

with more covariates for table 3 revisions_morecovariates

model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat)* Intro10 + as.factor(Eastern_alr_cat) * Intro10 + as.factor(Southeastern_alr_cat)* Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + TB_category + general_examination_malnutrition + Education+ DR_category, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) * Intro10 + 
##     as.factor(Eastern_alr_cat) * Intro10 + as.factor(Southeastern_alr_cat) * 
##     Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + 
##     as.factor(cough_cat) + socio_cat + TB_category + general_examination_malnutrition + 
##     Education + DR_category, family = binomial, data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                               -2.00831    0.95702  -2.099   0.0359
## as.factor(Western_alr_cat)2               -0.46162    0.80266  -0.575   0.5652
## as.factor(Western_alr_cat)3               -0.14571    0.73497  -0.198   0.8428
## as.factor(Western_alr_cat)4                0.07573    0.70372   0.108   0.9143
## Intro101                                   0.13737    1.52345   0.090   0.9282
## as.factor(Eastern_alr_cat)2               -0.28909    0.57960  -0.499   0.6179
## as.factor(Eastern_alr_cat)3               -0.58538    0.60492  -0.968   0.3332
## as.factor(Southeastern_alr_cat)2           1.35118    0.54947   2.459   0.0139
## as.factor(Southeastern_alr_cat)3           0.83440    0.58829   1.418   0.1561
## as.factor(age_cat)2                        0.03185    0.33431   0.095   0.9241
## as.factor(age_cat)3                        0.03314    0.39646   0.084   0.9334
## as.factor(age_cat)4                        0.15779    0.56537   0.279   0.7802
## patient_sexfemale                         -0.20237    0.31779  -0.637   0.5243
## TB_RF_smokingyes                          -0.51616    0.35843  -1.440   0.1498
## as.factor(cough_cat)2                     -0.74782    0.42300  -1.768   0.0771
## as.factor(cough_cat)3                     -0.12065    0.30102  -0.401   0.6886
## socio_cat2                                -0.25048    0.31928  -0.785   0.4327
## socio_cat3                                 0.17153    0.44511   0.385   0.7000
## TB_categoryrelapse                        -0.05243    1.15851  -0.045   0.9639
## general_examination_malnutritionyes        0.41028    0.38360   1.070   0.2848
## Educationno                                0.08301    0.42147   0.197   0.8439
## Educationsecondary                         0.16923    0.37872   0.447   0.6550
## Educationuniversity                       -0.39236    0.68054  -0.577   0.5643
## DR_categoryresistant                      -0.42738    0.79761  -0.536   0.5921
## as.factor(Western_alr_cat)2:Intro101       0.08132    1.16012   0.070   0.9441
## as.factor(Western_alr_cat)3:Intro101      -1.00725    1.06956  -0.942   0.3463
## as.factor(Western_alr_cat)4:Intro101      -1.68837    1.03160  -1.637   0.1017
## Intro101:as.factor(Eastern_alr_cat)2       1.90069    1.24917   1.522   0.1281
## Intro101:as.factor(Eastern_alr_cat)3       2.07009    1.25592   1.648   0.0993
## Intro101:as.factor(Southeastern_alr_cat)2 -0.99077    0.81421  -1.217   0.2237
## Intro101:as.factor(Southeastern_alr_cat)3  0.06626    0.83183   0.080   0.9365
##                                            
## (Intercept)                               *
## as.factor(Western_alr_cat)2                
## as.factor(Western_alr_cat)3                
## as.factor(Western_alr_cat)4                
## Intro101                                   
## as.factor(Eastern_alr_cat)2                
## as.factor(Eastern_alr_cat)3                
## as.factor(Southeastern_alr_cat)2          *
## as.factor(Southeastern_alr_cat)3           
## as.factor(age_cat)2                        
## as.factor(age_cat)3                        
## as.factor(age_cat)4                        
## patient_sexfemale                          
## TB_RF_smokingyes                           
## as.factor(cough_cat)2                     .
## as.factor(cough_cat)3                      
## socio_cat2                                 
## socio_cat3                                 
## TB_categoryrelapse                         
## general_examination_malnutritionyes        
## Educationno                                
## Educationsecondary                         
## Educationuniversity                        
## DR_categoryresistant                       
## as.factor(Western_alr_cat)2:Intro101       
## as.factor(Western_alr_cat)3:Intro101       
## as.factor(Western_alr_cat)4:Intro101       
## Intro101:as.factor(Eastern_alr_cat)2       
## Intro101:as.factor(Eastern_alr_cat)3      .
## Intro101:as.factor(Southeastern_alr_cat)2  
## Intro101:as.factor(Southeastern_alr_cat)3  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 387.03  on 445  degrees of freedom
## AIC: 449.03
## 
## Number of Fisher Scoring iterations: 5
ORs <- exp(cbind(OR = coef(model1), confint(model1)))
## Waiting for profiling to be done...
round(ORs, digits = 2)
##                                             OR 2.5 % 97.5 %
## (Intercept)                               0.13  0.02   0.79
## as.factor(Western_alr_cat)2               0.63  0.14   3.46
## as.factor(Western_alr_cat)3               0.86  0.22   4.32
## as.factor(Western_alr_cat)4               1.08  0.30   5.16
## Intro101                                  1.15  0.04  19.97
## as.factor(Eastern_alr_cat)2               0.75  0.25   2.54
## as.factor(Eastern_alr_cat)3               0.56  0.18   1.97
## as.factor(Southeastern_alr_cat)2          3.86  1.41  12.65
## as.factor(Southeastern_alr_cat)3          2.30  0.77   8.00
## as.factor(age_cat)2                       1.03  0.53   1.99
## as.factor(age_cat)3                       1.03  0.47   2.22
## as.factor(age_cat)4                       1.17  0.35   3.35
## patient_sexfemale                         0.82  0.43   1.51
## TB_RF_smokingyes                          0.60  0.29   1.18
## as.factor(cough_cat)2                     0.47  0.20   1.05
## as.factor(cough_cat)3                     0.89  0.49   1.61
## socio_cat2                                0.78  0.41   1.44
## socio_cat3                                1.19  0.48   2.77
## TB_categoryrelapse                        0.95  0.05   6.80
## general_examination_malnutritionyes       1.51  0.69   3.14
## Educationno                               1.09  0.46   2.41
## Educationsecondary                        1.18  0.55   2.46
## Educationuniversity                       0.68  0.15   2.28
## DR_categoryresistant                      0.65  0.10   2.59
## as.factor(Western_alr_cat)2:Intro101      1.08  0.10  10.38
## as.factor(Western_alr_cat)3:Intro101      0.37  0.04   2.87
## as.factor(Western_alr_cat)4:Intro101      0.18  0.02   1.34
## Intro101:as.factor(Eastern_alr_cat)2      6.69  0.75 152.26
## Intro101:as.factor(Eastern_alr_cat)3      7.93  0.87 181.77
## Intro101:as.factor(Southeastern_alr_cat)2 0.37  0.07   1.86
## Intro101:as.factor(Southeastern_alr_cat)3 1.07  0.20   5.56
model2 <- glm(Xray_01 ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + TB_category + general_examination_malnutrition + Education+ DR_category, data = clean, family = binomial, na.action = na.exclude)

lrtest(model2, model1) 
## Likelihood ratio test
## 
## Model 1: Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     TB_category + general_examination_malnutrition + Education + 
##     DR_category
## Model 2: Xray_01 ~ as.factor(Western_alr_cat) * Intro10 + as.factor(Eastern_alr_cat) * 
##     Intro10 + as.factor(Southeastern_alr_cat) * Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     TB_category + general_examination_malnutrition + Education + 
##     DR_category
##   #Df  LogLik Df  Chisq Pr(>Chisq)  
## 1  24 -200.47                       
## 2  31 -193.52  7 13.908    0.05284 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

does removal of cough duration have an effect?

–> not really, values change slightly…

model1 <- glm(Xray_01 ~ as.factor(Western_alr_cat)* Intro10 + as.factor(Eastern_alr_cat) * Intro10 + as.factor(Southeastern_alr_cat)* Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + socio_cat + TB_category + general_examination_malnutrition + Education+ DR_category, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = Xray_01 ~ as.factor(Western_alr_cat) * Intro10 + 
##     as.factor(Eastern_alr_cat) * Intro10 + as.factor(Southeastern_alr_cat) * 
##     Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + 
##     socio_cat + TB_category + general_examination_malnutrition + 
##     Education + DR_category, family = binomial, data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                               -2.30239    0.92070  -2.501   0.0124
## as.factor(Western_alr_cat)2               -0.33045    0.79576  -0.415   0.6779
## as.factor(Western_alr_cat)3               -0.06567    0.73185  -0.090   0.9285
## as.factor(Western_alr_cat)4                0.17412    0.69961   0.249   0.8035
## Intro101                                   0.16748    1.52078   0.110   0.9123
## as.factor(Eastern_alr_cat)2               -0.22785    0.57661  -0.395   0.6927
## as.factor(Eastern_alr_cat)3               -0.55605    0.60249  -0.923   0.3561
## as.factor(Southeastern_alr_cat)2           1.36039    0.54846   2.480   0.0131
## as.factor(Southeastern_alr_cat)3           0.83597    0.58548   1.428   0.1533
## as.factor(age_cat)2                       -0.01692    0.33111  -0.051   0.9593
## as.factor(age_cat)3                        0.04379    0.39364   0.111   0.9114
## as.factor(age_cat)4                        0.05777    0.55986   0.103   0.9178
## patient_sexfemale                         -0.17224    0.31528  -0.546   0.5849
## TB_RF_smokingyes                          -0.44580    0.35284  -1.263   0.2064
## socio_cat2                                -0.26356    0.31907  -0.826   0.4088
## socio_cat3                                 0.20594    0.44109   0.467   0.6406
## TB_categoryrelapse                         0.07240    1.15210   0.063   0.9499
## general_examination_malnutritionyes        0.42712    0.37255   1.146   0.2516
## Educationno                                0.03588    0.41710   0.086   0.9314
## Educationsecondary                         0.14028    0.37522   0.374   0.7085
## Educationuniversity                       -0.35290    0.67402  -0.524   0.6006
## DR_categoryresistant                      -0.46402    0.79838  -0.581   0.5611
## as.factor(Western_alr_cat)2:Intro101      -0.13897    1.14500  -0.121   0.9034
## as.factor(Western_alr_cat)3:Intro101      -1.11454    1.05718  -1.054   0.2918
## as.factor(Western_alr_cat)4:Intro101      -1.75688    1.02416  -1.715   0.0863
## Intro101:as.factor(Eastern_alr_cat)2       1.85394    1.25497   1.477   0.1396
## Intro101:as.factor(Eastern_alr_cat)3       2.11859    1.26283   1.678   0.0934
## Intro101:as.factor(Southeastern_alr_cat)2 -0.94476    0.81266  -1.163   0.2450
## Intro101:as.factor(Southeastern_alr_cat)3  0.05187    0.82712   0.063   0.9500
##                                            
## (Intercept)                               *
## as.factor(Western_alr_cat)2                
## as.factor(Western_alr_cat)3                
## as.factor(Western_alr_cat)4                
## Intro101                                   
## as.factor(Eastern_alr_cat)2                
## as.factor(Eastern_alr_cat)3                
## as.factor(Southeastern_alr_cat)2          *
## as.factor(Southeastern_alr_cat)3           
## as.factor(age_cat)2                        
## as.factor(age_cat)3                        
## as.factor(age_cat)4                        
## patient_sexfemale                          
## TB_RF_smokingyes                           
## socio_cat2                                 
## socio_cat3                                 
## TB_categoryrelapse                         
## general_examination_malnutritionyes        
## Educationno                                
## Educationsecondary                         
## Educationuniversity                        
## DR_categoryresistant                       
## as.factor(Western_alr_cat)2:Intro101       
## as.factor(Western_alr_cat)3:Intro101       
## as.factor(Western_alr_cat)4:Intro101      .
## Intro101:as.factor(Eastern_alr_cat)2       
## Intro101:as.factor(Eastern_alr_cat)3      .
## Intro101:as.factor(Southeastern_alr_cat)2  
## Intro101:as.factor(Southeastern_alr_cat)3  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 421.34  on 475  degrees of freedom
## Residual deviance: 390.63  on 447  degrees of freedom
## AIC: 448.63
## 
## Number of Fisher Scoring iterations: 5
ORs <- exp(cbind(OR = coef(model1), confint(model1)))
## Waiting for profiling to be done...
round(ORs, digits = 2)
##                                             OR 2.5 % 97.5 %
## (Intercept)                               0.10  0.01   0.54
## as.factor(Western_alr_cat)2               0.72  0.16   3.90
## as.factor(Western_alr_cat)3               0.94  0.24   4.66
## as.factor(Western_alr_cat)4               1.19  0.34   5.66
## Intro101                                  1.18  0.04  20.48
## as.factor(Eastern_alr_cat)2               0.80  0.27   2.69
## as.factor(Eastern_alr_cat)3               0.57  0.18   2.02
## as.factor(Southeastern_alr_cat)2          3.90  1.43  12.75
## as.factor(Southeastern_alr_cat)3          2.31  0.77   7.98
## as.factor(age_cat)2                       0.98  0.51   1.88
## as.factor(age_cat)3                       1.04  0.47   2.23
## as.factor(age_cat)4                       1.06  0.32   2.99
## patient_sexfemale                         0.84  0.45   1.55
## TB_RF_smokingyes                          0.64  0.31   1.26
## socio_cat2                                0.77  0.40   1.42
## socio_cat3                                1.23  0.50   2.84
## TB_categoryrelapse                        1.08  0.05   7.61
## general_examination_malnutritionyes       1.53  0.72   3.12
## Educationno                               1.04  0.44   2.28
## Educationsecondary                        1.15  0.54   2.37
## Educationuniversity                       0.70  0.15   2.33
## DR_categoryresistant                      0.63  0.09   2.49
## as.factor(Western_alr_cat)2:Intro101      0.87  0.09   8.06
## as.factor(Western_alr_cat)3:Intro101      0.33  0.04   2.51
## as.factor(Western_alr_cat)4:Intro101      0.17  0.02   1.23
## Intro101:as.factor(Eastern_alr_cat)2      6.38  0.71 146.86
## Intro101:as.factor(Eastern_alr_cat)3      8.32  0.91 193.14
## Intro101:as.factor(Southeastern_alr_cat)2 0.39  0.08   1.94
## Intro101:as.factor(Southeastern_alr_cat)3 1.05  0.20   5.43
model2 <- glm(Xray_01 ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking  + socio_cat + TB_category + general_examination_malnutrition + Education+ DR_category, data = clean, family = binomial, na.action = na.exclude)

lrtest(model2, model1) 
## Likelihood ratio test
## 
## Model 1: Xray_01 ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + socio_cat + TB_category + general_examination_malnutrition + 
##     Education + DR_category
## Model 2: Xray_01 ~ as.factor(Western_alr_cat) * Intro10 + as.factor(Eastern_alr_cat) * 
##     Intro10 + as.factor(Southeastern_alr_cat) * Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + socio_cat + TB_category + general_examination_malnutrition + 
##     Education + DR_category
##   #Df  LogLik Df  Chisq Pr(>Chisq)  
## 1  22 -202.03                       
## 2  29 -195.31  7 13.427    0.06237 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

now for TB-score

create a model and look at the relationship

library(lmtest)

clean <- na.omit(new_data[new_data$HIV_status == "negative", c("Western_alr_cat", "Southeastern_alr_cat", "Eastern_alr_cat", "patient_sex", "age_cat", "symptoms_duration_cough_duration", "Intro10", "TB_RF_smoking", "TB_score_2cat", "Western_Bantu_alr", "Southeastern_Bantu_alr", "Eastern_Bantu_alr", "cough_cat", "socio_cat", "TB_category", "general_examination_malnutrition", "Education", "DR_category")])
dim(clean)
## [1] 827  18
model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Southeastern_alr_cat) + as.factor(Eastern_alr_cat), data = clean, family = binomial, na.action = na.exclude)
print(summary(model1))
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     as.factor(Eastern_alr_cat), family = binomial, data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                       0.70490    0.33236   2.121   0.0339 *
## as.factor(Western_alr_cat)2       0.20267    0.33418   0.606   0.5442  
## as.factor(Western_alr_cat)3       0.28739    0.29728   0.967   0.3337  
## as.factor(Western_alr_cat)4       0.21666    0.29239   0.741   0.4587  
## as.factor(Southeastern_alr_cat)2 -0.34864    0.20122  -1.733   0.0832 .
## as.factor(Southeastern_alr_cat)3  0.09367    0.21729   0.431   0.6664  
## as.factor(Eastern_alr_cat)2       0.05064    0.24584   0.206   0.8368  
## as.factor(Eastern_alr_cat)3      -0.09498    0.25347  -0.375   0.7079  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.6  on 826  degrees of freedom
## Residual deviance: 1018.8  on 819  degrees of freedom
## AIC: 1034.8
## 
## Number of Fisher Scoring iterations: 4
ORs <- exp(cbind(OR = coef(model1), confint(model1)))
## Waiting for profiling to be done...
round(ORs, digits = 2)
##                                    OR 2.5 % 97.5 %
## (Intercept)                      2.02  1.07   3.94
## as.factor(Western_alr_cat)2      1.22  0.63   2.35
## as.factor(Western_alr_cat)3      1.33  0.74   2.37
## as.factor(Western_alr_cat)4      1.24  0.69   2.18
## as.factor(Southeastern_alr_cat)2 0.71  0.47   1.04
## as.factor(Southeastern_alr_cat)3 1.10  0.72   1.68
## as.factor(Eastern_alr_cat)2      1.05  0.64   1.69
## as.factor(Eastern_alr_cat)3      0.91  0.55   1.49
model2 <- glm(TB_score_2cat ~ 1, data = clean, family = binomial, na.action = na.exclude)
lrtest(model1, model2)
## Likelihood ratio test
## 
## Model 1: TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     as.factor(Eastern_alr_cat)
## Model 2: TB_score_2cat ~ 1
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   8 -509.39                     
## 2   1 -513.29 -7 7.8013     0.3504
# add the introduction
model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10, family = binomial, 
##     data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                       0.68930    0.33714   2.045   0.0409 *
## as.factor(Western_alr_cat)2       0.20506    0.33431   0.613   0.5396  
## as.factor(Western_alr_cat)3       0.28703    0.29725   0.966   0.3342  
## as.factor(Western_alr_cat)4       0.21841    0.29245   0.747   0.4552  
## as.factor(Eastern_alr_cat)2       0.05215    0.24587   0.212   0.8320  
## as.factor(Eastern_alr_cat)3      -0.09711    0.25357  -0.383   0.7018  
## as.factor(Southeastern_alr_cat)2 -0.35010    0.20130  -1.739   0.0820 .
## as.factor(Southeastern_alr_cat)3  0.09287    0.21731   0.427   0.6691  
## Intro101                          0.04348    0.15768   0.276   0.7827  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.6  on 826  degrees of freedom
## Residual deviance: 1018.7  on 818  degrees of freedom
## AIC: 1036.7
## 
## Number of Fisher Scoring iterations: 4
# add the covariates
# + age + patient_sex + TB_RF_smoking + HIV_status + symptoms_duration_cough_duration + socioeconomic_status

# do it one by one
# age
model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat), data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat), 
##     family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                       0.52630    0.34735   1.515   0.1297  
## as.factor(Western_alr_cat)2       0.24284    0.33587   0.723   0.4697  
## as.factor(Western_alr_cat)3       0.30920    0.29822   1.037   0.2998  
## as.factor(Western_alr_cat)4       0.23192    0.29411   0.789   0.4304  
## as.factor(Eastern_alr_cat)2       0.04839    0.24681   0.196   0.8446  
## as.factor(Eastern_alr_cat)3      -0.09932    0.25479  -0.390   0.6967  
## as.factor(Southeastern_alr_cat)2 -0.37162    0.20240  -1.836   0.0664 .
## as.factor(Southeastern_alr_cat)3  0.09299    0.21854   0.426   0.6705  
## Intro101                          0.05247    0.15853   0.331   0.7407  
## as.factor(age_cat)2               0.37085    0.17707   2.094   0.0362 *
## as.factor(age_cat)3               0.02118    0.21399   0.099   0.9211  
## as.factor(age_cat)4               0.37656    0.31716   1.187   0.2351  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.6  on 826  degrees of freedom
## Residual deviance: 1013.1  on 815  degrees of freedom
## AIC: 1037.1
## 
## Number of Fisher Scoring iterations: 4
# add the covariates
# + age + patient_sex + TB_RF_smoking + HIV_status + symptoms_duration_cough_duration

# do it one by one
# sex

model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex, family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                       0.63772    0.35773   1.783   0.0746 .
## as.factor(Western_alr_cat)2       0.22245    0.33652   0.661   0.5086  
## as.factor(Western_alr_cat)3       0.30203    0.29870   1.011   0.3120  
## as.factor(Western_alr_cat)4       0.21834    0.29463   0.741   0.4587  
## as.factor(Eastern_alr_cat)2       0.02713    0.24754   0.110   0.9127  
## as.factor(Eastern_alr_cat)3      -0.10608    0.25506  -0.416   0.6775  
## as.factor(Southeastern_alr_cat)2 -0.37089    0.20269  -1.830   0.0673 .
## as.factor(Southeastern_alr_cat)3  0.09722    0.21879   0.444   0.6568  
## Intro101                          0.03381    0.15927   0.212   0.8319  
## as.factor(age_cat)2               0.34720    0.17812   1.949   0.0513 .
## as.factor(age_cat)3              -0.01388    0.21577  -0.064   0.9487  
## as.factor(age_cat)4               0.32375    0.31982   1.012   0.3114  
## patient_sexfemale                -0.23097    0.17026  -1.357   0.1749  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.6  on 826  degrees of freedom
## Residual deviance: 1011.3  on 814  degrees of freedom
## AIC: 1037.3
## 
## Number of Fisher Scoring iterations: 4
# add the covariates
# + age + patient_sex + TB_RF_smoking + symptoms_duration_cough_duration

# do it one by one
# smoking

model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking, family = binomial, data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                       0.69694    0.36071   1.932   0.0533 .
## as.factor(Western_alr_cat)2       0.20345    0.33728   0.603   0.5464  
## as.factor(Western_alr_cat)3       0.30593    0.29911   1.023   0.3064  
## as.factor(Western_alr_cat)4       0.20414    0.29528   0.691   0.4894  
## as.factor(Eastern_alr_cat)2       0.04765    0.24835   0.192   0.8478  
## as.factor(Eastern_alr_cat)3      -0.09191    0.25552  -0.360   0.7191  
## as.factor(Southeastern_alr_cat)2 -0.36857    0.20295  -1.816   0.0694 .
## as.factor(Southeastern_alr_cat)3  0.09789    0.21895   0.447   0.6548  
## Intro101                          0.04199    0.15957   0.263   0.7924  
## as.factor(age_cat)2               0.37346    0.17943   2.081   0.0374 *
## as.factor(age_cat)3               0.05038    0.22112   0.228   0.8198  
## as.factor(age_cat)4               0.37767    0.32263   1.171   0.2418  
## patient_sexfemale                -0.31936    0.18241  -1.751   0.0800 .
## TB_RF_smokingyes                 -0.27066    0.19257  -1.406   0.1599  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.6  on 826  degrees of freedom
## Residual deviance: 1009.4  on 813  degrees of freedom
## AIC: 1037.4
## 
## Number of Fisher Scoring iterations: 4
# add the covariates
# + age + patient_sex + TB_RF_smoking + symptoms_duration_cough_duration

# do it one by one
# cough duration

model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat), data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat), family = binomial, 
##     data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                       0.93537    0.39941   2.342   0.0192 *
## as.factor(Western_alr_cat)2       0.17151    0.33866   0.506   0.6126  
## as.factor(Western_alr_cat)3       0.29047    0.30015   0.968   0.3332  
## as.factor(Western_alr_cat)4       0.17752    0.29657   0.599   0.5495  
## as.factor(Eastern_alr_cat)2       0.05218    0.24883   0.210   0.8339  
## as.factor(Eastern_alr_cat)3      -0.09994    0.25601  -0.390   0.6963  
## as.factor(Southeastern_alr_cat)2 -0.38882    0.20392  -1.907   0.0566 .
## as.factor(Southeastern_alr_cat)3  0.07958    0.21948   0.363   0.7169  
## Intro101                          0.05349    0.16027   0.334   0.7386  
## as.factor(age_cat)2               0.37649    0.17982   2.094   0.0363 *
## as.factor(age_cat)3               0.05978    0.22173   0.270   0.7875  
## as.factor(age_cat)4               0.38458    0.32344   1.189   0.2344  
## patient_sexfemale                -0.31405    0.18296  -1.717   0.0861 .
## TB_RF_smokingyes                 -0.28605    0.19309  -1.481   0.1385  
## as.factor(cough_cat)2            -0.18385    0.23603  -0.779   0.4360  
## as.factor(cough_cat)3            -0.29979    0.18927  -1.584   0.1132  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.6  on 826  degrees of freedom
## Residual deviance: 1006.8  on 811  degrees of freedom
## AIC: 1038.8
## 
## Number of Fisher Scoring iterations: 4
# add the covariates
# + age + patient_sex + TB_RF_smoking + symptoms_duration_cough_duration + socioeconomic_status

# do it one by one
# socioeconomic_status

model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat, 
##     family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                       0.71842    0.41195   1.744   0.0812 .
## as.factor(Western_alr_cat)2       0.18827    0.33952   0.555   0.5792  
## as.factor(Western_alr_cat)3       0.28172    0.30079   0.937   0.3490  
## as.factor(Western_alr_cat)4       0.13398    0.29795   0.450   0.6529  
## as.factor(Eastern_alr_cat)2       0.06737    0.24964   0.270   0.7873  
## as.factor(Eastern_alr_cat)3      -0.07102    0.25702  -0.276   0.7823  
## as.factor(Southeastern_alr_cat)2 -0.36137    0.20600  -1.754   0.0794 .
## as.factor(Southeastern_alr_cat)3  0.15173    0.22299   0.680   0.4962  
## Intro101                          0.05134    0.16080   0.319   0.7495  
## as.factor(age_cat)2               0.38071    0.18054   2.109   0.0350 *
## as.factor(age_cat)3               0.06618    0.22248   0.297   0.7661  
## as.factor(age_cat)4               0.40480    0.32496   1.246   0.2129  
## patient_sexfemale                -0.26222    0.18522  -1.416   0.1569  
## TB_RF_smokingyes                 -0.30662    0.19507  -1.572   0.1160  
## as.factor(cough_cat)2            -0.16481    0.23708  -0.695   0.4869  
## as.factor(cough_cat)3            -0.29879    0.19042  -1.569   0.1166  
## socio_cat2                        0.36679    0.17564   2.088   0.0368 *
## socio_cat3                        0.34769    0.24187   1.437   0.1506  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.6  on 826  degrees of freedom
## Residual deviance: 1001.6  on 809  degrees of freedom
## AIC: 1037.6
## 
## Number of Fisher Scoring iterations: 4
# add the covariates

# do it one by one
model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     general_examination_malnutrition, family = binomial, data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          1.086537   0.436956   2.487   0.0129 *  
## as.factor(Western_alr_cat)2          0.018147   0.363150   0.050   0.9601    
## as.factor(Western_alr_cat)3          0.038595   0.323091   0.119   0.9049    
## as.factor(Western_alr_cat)4         -0.134194   0.320310  -0.419   0.6753    
## as.factor(Eastern_alr_cat)2          0.128064   0.259859   0.493   0.6221    
## as.factor(Eastern_alr_cat)3          0.055175   0.267752   0.206   0.8367    
## as.factor(Southeastern_alr_cat)2    -0.471217   0.216684  -2.175   0.0297 *  
## as.factor(Southeastern_alr_cat)3     0.059076   0.234298   0.252   0.8009    
## Intro101                            -0.006882   0.167821  -0.041   0.9673    
## as.factor(age_cat)2                  0.366365   0.188392   1.945   0.0518 .  
## as.factor(age_cat)3                  0.046685   0.232749   0.201   0.8410    
## as.factor(age_cat)4                  0.431431   0.340247   1.268   0.2048    
## patient_sexfemale                   -0.244903   0.192871  -1.270   0.2042    
## TB_RF_smokingyes                    -0.132489   0.206049  -0.643   0.5202    
## as.factor(cough_cat)2               -0.149243   0.245830  -0.607   0.5438    
## as.factor(cough_cat)3               -0.220901   0.197253  -1.120   0.2628    
## socio_cat2                           0.472632   0.185684   2.545   0.0109 *  
## socio_cat3                           0.295884   0.251026   1.179   0.2385    
## general_examination_malnutritionyes -1.873332   0.249238  -7.516 5.64e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.59  on 826  degrees of freedom
## Residual deviance:  939.15  on 808  degrees of freedom
## AIC: 977.15
## 
## Number of Fisher Scoring iterations: 4
# add the covariates

# do it one by one
model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition + TB_category, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     general_examination_malnutrition + TB_category, family = binomial, 
##     data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          1.06671    0.43775   2.437   0.0148 *  
## as.factor(Western_alr_cat)2          0.01939    0.36356   0.053   0.9575    
## as.factor(Western_alr_cat)3          0.05007    0.32370   0.155   0.8771    
## as.factor(Western_alr_cat)4         -0.12293    0.32090  -0.383   0.7017    
## as.factor(Eastern_alr_cat)2          0.13856    0.26040   0.532   0.5947    
## as.factor(Eastern_alr_cat)3          0.05188    0.26802   0.194   0.8465    
## as.factor(Southeastern_alr_cat)2    -0.46632    0.21696  -2.149   0.0316 *  
## as.factor(Southeastern_alr_cat)3     0.07586    0.23526   0.322   0.7471    
## Intro101                             0.00385    0.16832   0.023   0.9818    
## as.factor(age_cat)2                  0.37088    0.18865   1.966   0.0493 *  
## as.factor(age_cat)3                  0.05200    0.23298   0.223   0.8234    
## as.factor(age_cat)4                  0.44592    0.34042   1.310   0.1902    
## patient_sexfemale                   -0.24775    0.19304  -1.283   0.1993    
## TB_RF_smokingyes                    -0.12296    0.20647  -0.596   0.5515    
## as.factor(cough_cat)2               -0.15802    0.24615  -0.642   0.5209    
## as.factor(cough_cat)3               -0.21722    0.19737  -1.101   0.2711    
## socio_cat2                           0.47638    0.18594   2.562   0.0104 *  
## socio_cat3                           0.29646    0.25117   1.180   0.2379    
## general_examination_malnutritionyes -1.88471    0.24970  -7.548 4.42e-14 ***
## TB_categoryrelapse                  -0.65192    0.65412  -0.997   0.3189    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.59  on 826  degrees of freedom
## Residual deviance:  938.22  on 807  degrees of freedom
## AIC: 978.22
## 
## Number of Fisher Scoring iterations: 4
# add the covariates

# do it one by one
model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition + TB_category + Education, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     general_examination_malnutrition + TB_category + Education, 
##     family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          1.167458   0.444357   2.627  0.00861 ** 
## as.factor(Western_alr_cat)2          0.006276   0.364738   0.017  0.98627    
## as.factor(Western_alr_cat)3          0.065362   0.324585   0.201  0.84041    
## as.factor(Western_alr_cat)4         -0.088686   0.322586  -0.275  0.78338    
## as.factor(Eastern_alr_cat)2          0.151207   0.261237   0.579  0.56272    
## as.factor(Eastern_alr_cat)3          0.064851   0.269427   0.241  0.80979    
## as.factor(Southeastern_alr_cat)2    -0.486446   0.218141  -2.230  0.02575 *  
## as.factor(Southeastern_alr_cat)3     0.044320   0.239368   0.185  0.85311    
## Intro101                             0.007642   0.168652   0.045  0.96386    
## as.factor(age_cat)2                  0.304707   0.197869   1.540  0.12358    
## as.factor(age_cat)3                 -0.018197   0.239702  -0.076  0.93949    
## as.factor(age_cat)4                  0.398312   0.343699   1.159  0.24650    
## patient_sexfemale                   -0.249638   0.194145  -1.286  0.19850    
## TB_RF_smokingyes                    -0.128587   0.208658  -0.616  0.53772    
## as.factor(cough_cat)2               -0.156419   0.246721  -0.634  0.52608    
## as.factor(cough_cat)3               -0.223643   0.198169  -1.129  0.25909    
## socio_cat2                           0.480158   0.187127   2.566  0.01029 *  
## socio_cat3                           0.285483   0.254471   1.122  0.26192    
## general_examination_malnutritionyes -1.873419   0.252732  -7.413 1.24e-13 ***
## TB_categoryrelapse                  -0.641757   0.652758  -0.983  0.32553    
## Educationno                         -0.114220   0.254724  -0.448  0.65386    
## Educationsecondary                  -0.318200   0.221965  -1.434  0.15170    
## Educationuniversity                  0.086127   0.416952   0.207  0.83635    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.6  on 826  degrees of freedom
## Residual deviance:  935.9  on 804  degrees of freedom
## AIC: 981.9
## 
## Number of Fisher Scoring iterations: 4
# add the covariates

# do it one by one
model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat)+ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition + TB_category + Education + DR_category, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     general_examination_malnutrition + TB_category + Education + 
##     DR_category, family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          1.14273    0.44427   2.572   0.0101 *  
## as.factor(Western_alr_cat)2          0.01017    0.36477   0.028   0.9778    
## as.factor(Western_alr_cat)3          0.07840    0.32492   0.241   0.8093    
## as.factor(Western_alr_cat)4         -0.07257    0.32295  -0.225   0.8222    
## as.factor(Eastern_alr_cat)2          0.15699    0.26218   0.599   0.5493    
## as.factor(Eastern_alr_cat)3          0.07763    0.27054   0.287   0.7742    
## as.factor(Southeastern_alr_cat)2    -0.50031    0.21867  -2.288   0.0221 *  
## as.factor(Southeastern_alr_cat)3     0.04098    0.23960   0.171   0.8642    
## Intro101                             0.01527    0.16894   0.090   0.9280    
## as.factor(age_cat)2                  0.29100    0.19811   1.469   0.1419    
## as.factor(age_cat)3                 -0.02920    0.24028  -0.122   0.9033    
## as.factor(age_cat)4                  0.38147    0.34426   1.108   0.2678    
## patient_sexfemale                   -0.25589    0.19440  -1.316   0.1881    
## TB_RF_smokingyes                    -0.12069    0.20907  -0.577   0.5638    
## as.factor(cough_cat)2               -0.15967    0.24702  -0.646   0.5180    
## as.factor(cough_cat)3               -0.22468    0.19821  -1.134   0.2570    
## socio_cat2                           0.47462    0.18749   2.531   0.0114 *  
## socio_cat3                           0.28641    0.25426   1.126   0.2600    
## general_examination_malnutritionyes -1.87628    0.25310  -7.413 1.23e-13 ***
## TB_categoryrelapse                  -0.62827    0.65276  -0.962   0.3358    
## Educationno                         -0.11636    0.25462  -0.457   0.6477    
## Educationsecondary                  -0.31096    0.22216  -1.400   0.1616    
## Educationuniversity                  0.09543    0.41745   0.229   0.8192    
## DR_categoryresistant                 0.46484    0.44231   1.051   0.2933    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.59  on 826  degrees of freedom
## Residual deviance:  934.73  on 803  degrees of freedom
## AIC: 982.73
## 
## Number of Fisher Scoring iterations: 4

pvalue for the ancestries

same with more covariates

library(lmtest)

model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking+ as.factor(cough_cat) + socio_cat+ general_examination_malnutrition + TB_category + Education + DR_category, data = clean, family = binomial, na.action = na.exclude)

model2 <- glm(TB_score_2cat ~ Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking+ as.factor(cough_cat) + socio_cat+ general_examination_malnutrition + TB_category + Education + DR_category, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     general_examination_malnutrition + TB_category + Education + 
##     DR_category, family = binomial, data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          1.14273    0.44427   2.572   0.0101 *  
## as.factor(Western_alr_cat)2          0.01017    0.36477   0.028   0.9778    
## as.factor(Western_alr_cat)3          0.07840    0.32492   0.241   0.8093    
## as.factor(Western_alr_cat)4         -0.07257    0.32295  -0.225   0.8222    
## as.factor(Eastern_alr_cat)2          0.15699    0.26218   0.599   0.5493    
## as.factor(Eastern_alr_cat)3          0.07763    0.27054   0.287   0.7742    
## as.factor(Southeastern_alr_cat)2    -0.50031    0.21867  -2.288   0.0221 *  
## as.factor(Southeastern_alr_cat)3     0.04098    0.23960   0.171   0.8642    
## Intro101                             0.01527    0.16894   0.090   0.9280    
## as.factor(age_cat)2                  0.29100    0.19811   1.469   0.1419    
## as.factor(age_cat)3                 -0.02920    0.24028  -0.122   0.9033    
## as.factor(age_cat)4                  0.38147    0.34426   1.108   0.2678    
## patient_sexfemale                   -0.25589    0.19440  -1.316   0.1881    
## TB_RF_smokingyes                    -0.12069    0.20907  -0.577   0.5638    
## as.factor(cough_cat)2               -0.15967    0.24702  -0.646   0.5180    
## as.factor(cough_cat)3               -0.22468    0.19821  -1.134   0.2570    
## socio_cat2                           0.47462    0.18749   2.531   0.0114 *  
## socio_cat3                           0.28641    0.25426   1.126   0.2600    
## general_examination_malnutritionyes -1.87628    0.25310  -7.413 1.23e-13 ***
## TB_categoryrelapse                  -0.62827    0.65276  -0.962   0.3358    
## Educationno                         -0.11636    0.25462  -0.457   0.6477    
## Educationsecondary                  -0.31096    0.22216  -1.400   0.1616    
## Educationuniversity                  0.09543    0.41745   0.229   0.8192    
## DR_categoryresistant                 0.46484    0.44231   1.051   0.2933    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.59  on 826  degrees of freedom
## Residual deviance:  934.73  on 803  degrees of freedom
## AIC: 982.73
## 
## Number of Fisher Scoring iterations: 4
lrtest(model1, model2)
## Likelihood ratio test
## 
## Model 1: TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     general_examination_malnutrition + TB_category + Education + 
##     DR_category
## Model 2: TB_score_2cat ~ Intro10 + as.factor(age_cat) + patient_sex + 
##     TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition + 
##     TB_category + Education + DR_category
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1  24 -467.36                     
## 2  17 -472.71 -7 10.698     0.1523
ORs <- exp(cbind(OR = coef(model1), confint(model1)))
## Waiting for profiling to be done...
round(ORs, digits = 2)
##                                       OR 2.5 % 97.5 %
## (Intercept)                         3.14  1.33   7.59
## as.factor(Western_alr_cat)2         1.01  0.49   2.05
## as.factor(Western_alr_cat)3         1.08  0.56   2.02
## as.factor(Western_alr_cat)4         0.93  0.49   1.73
## as.factor(Eastern_alr_cat)2         1.17  0.69   1.95
## as.factor(Eastern_alr_cat)3         1.08  0.63   1.83
## as.factor(Southeastern_alr_cat)2    0.61  0.39   0.93
## as.factor(Southeastern_alr_cat)3    1.04  0.65   1.66
## Intro101                            1.02  0.73   1.42
## as.factor(age_cat)2                 1.34  0.91   1.98
## as.factor(age_cat)3                 0.97  0.61   1.56
## as.factor(age_cat)4                 1.46  0.76   2.94
## patient_sexfemale                   0.77  0.53   1.13
## TB_RF_smokingyes                    0.89  0.59   1.34
## as.factor(cough_cat)2               0.85  0.53   1.39
## as.factor(cough_cat)3               0.80  0.54   1.17
## socio_cat2                          1.61  1.12   2.33
## socio_cat3                          1.33  0.82   2.21
## general_examination_malnutritionyes 0.15  0.09   0.25
## TB_categoryrelapse                  0.53  0.15   2.12
## Educationno                         0.89  0.54   1.48
## Educationsecondary                  0.73  0.48   1.14
## Educationuniversity                 1.10  0.50   2.62
## DR_categoryresistant                1.59  0.70   4.04

add the interaction

with more covariates

library(lmtest)

model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat)* Intro10 + as.factor(Eastern_alr_cat) * Intro10 + as.factor(Southeastern_alr_cat)* Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking+ as.factor(cough_cat) + socio_cat + general_examination_malnutrition + TB_category + Education + DR_category, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) * Intro10 + 
##     as.factor(Eastern_alr_cat) * Intro10 + as.factor(Southeastern_alr_cat) * 
##     Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + 
##     as.factor(cough_cat) + socio_cat + general_examination_malnutrition + 
##     TB_category + Education + DR_category, family = binomial, 
##     data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                                1.46849    0.53349   2.753  0.00591
## as.factor(Western_alr_cat)2               -0.28675    0.46376  -0.618  0.53636
## as.factor(Western_alr_cat)3               -0.07394    0.42219  -0.175  0.86098
## as.factor(Western_alr_cat)4               -0.29555    0.41773  -0.708  0.47925
## Intro101                                  -0.86340    0.73782  -1.170  0.24192
## as.factor(Eastern_alr_cat)2                0.05067    0.32837   0.154  0.87737
## as.factor(Eastern_alr_cat)3               -0.07201    0.34529  -0.209  0.83480
## as.factor(Southeastern_alr_cat)2          -0.55587    0.27225  -2.042  0.04118
## as.factor(Southeastern_alr_cat)3           0.04079    0.29617   0.138  0.89047
## as.factor(age_cat)2                        0.28276    0.19927   1.419  0.15591
## as.factor(age_cat)3                       -0.02029    0.24114  -0.084  0.93294
## as.factor(age_cat)4                        0.38609    0.34616   1.115  0.26470
## patient_sexfemale                         -0.25779    0.19549  -1.319  0.18727
## TB_RF_smokingyes                          -0.10814    0.21008  -0.515  0.60672
## as.factor(cough_cat)2                     -0.16449    0.24849  -0.662  0.50799
## as.factor(cough_cat)3                     -0.22390    0.19932  -1.123  0.26130
## socio_cat2                                 0.48885    0.18837   2.595  0.00945
## socio_cat3                                 0.28663    0.25560   1.121  0.26211
## general_examination_malnutritionyes       -1.88138    0.25416  -7.402 1.34e-13
## TB_categoryrelapse                        -0.60959    0.65664  -0.928  0.35323
## Educationno                               -0.10578    0.25685  -0.412  0.68045
## Educationsecondary                        -0.31445    0.22406  -1.403  0.16050
## Educationuniversity                        0.06849    0.41872   0.164  0.87006
## DR_categoryresistant                       0.46243    0.44616   1.036  0.29999
## as.factor(Western_alr_cat)2:Intro101       0.83716    0.77162   1.085  0.27795
## as.factor(Western_alr_cat)3:Intro101       0.33895    0.66914   0.507  0.61248
## as.factor(Western_alr_cat)4:Intro101       0.55788    0.65876   0.847  0.39707
## Intro101:as.factor(Eastern_alr_cat)2       0.24273    0.55593   0.437  0.66239
## Intro101:as.factor(Eastern_alr_cat)3       0.37890    0.56201   0.674  0.50019
## Intro101:as.factor(Southeastern_alr_cat)2  0.25745    0.45792   0.562  0.57398
## Intro101:as.factor(Southeastern_alr_cat)3  0.04920    0.48695   0.101  0.91952
##                                              
## (Intercept)                               ** 
## as.factor(Western_alr_cat)2                  
## as.factor(Western_alr_cat)3                  
## as.factor(Western_alr_cat)4                  
## Intro101                                     
## as.factor(Eastern_alr_cat)2                  
## as.factor(Eastern_alr_cat)3                  
## as.factor(Southeastern_alr_cat)2          *  
## as.factor(Southeastern_alr_cat)3             
## as.factor(age_cat)2                          
## as.factor(age_cat)3                          
## as.factor(age_cat)4                          
## patient_sexfemale                            
## TB_RF_smokingyes                             
## as.factor(cough_cat)2                        
## as.factor(cough_cat)3                        
## socio_cat2                                ** 
## socio_cat3                                   
## general_examination_malnutritionyes       ***
## TB_categoryrelapse                           
## Educationno                                  
## Educationsecondary                           
## Educationuniversity                          
## DR_categoryresistant                         
## as.factor(Western_alr_cat)2:Intro101         
## as.factor(Western_alr_cat)3:Intro101         
## as.factor(Western_alr_cat)4:Intro101         
## Intro101:as.factor(Eastern_alr_cat)2         
## Intro101:as.factor(Eastern_alr_cat)3         
## Intro101:as.factor(Southeastern_alr_cat)2    
## Intro101:as.factor(Southeastern_alr_cat)3    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.59  on 826  degrees of freedom
## Residual deviance:  932.14  on 796  degrees of freedom
## AIC: 994.14
## 
## Number of Fisher Scoring iterations: 4
# compare to a model without the interaction
model2 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat)+ as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking+ as.factor(cough_cat) + socio_cat+ general_examination_malnutrition + TB_category + Education + DR_category, data = clean, family = binomial, na.action = na.exclude)
lrtest(model1, model2)
## Likelihood ratio test
## 
## Model 1: TB_score_2cat ~ as.factor(Western_alr_cat) * Intro10 + as.factor(Eastern_alr_cat) * 
##     Intro10 + as.factor(Southeastern_alr_cat) * Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     general_examination_malnutrition + TB_category + Education + 
##     DR_category
## Model 2: TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     general_examination_malnutrition + TB_category + Education + 
##     DR_category
##   #Df  LogLik Df Chisq Pr(>Chisq)
## 1  31 -466.07                    
## 2  24 -467.36 -7 2.583     0.9207
ORs <- exp(cbind(OR = coef(model1), confint(model1)))
## Waiting for profiling to be done...
round(ORs, digits = 2)
##                                             OR 2.5 % 97.5 %
## (Intercept)                               4.34  1.56  12.69
## as.factor(Western_alr_cat)2               0.75  0.30   1.84
## as.factor(Western_alr_cat)3               0.93  0.39   2.08
## as.factor(Western_alr_cat)4               0.74  0.32   1.65
## Intro101                                  0.42  0.10   1.81
## as.factor(Eastern_alr_cat)2               1.05  0.54   1.98
## as.factor(Eastern_alr_cat)3               0.93  0.47   1.81
## as.factor(Southeastern_alr_cat)2          0.57  0.33   0.97
## as.factor(Southeastern_alr_cat)3          1.04  0.58   1.86
## as.factor(age_cat)2                       1.33  0.90   1.96
## as.factor(age_cat)3                       0.98  0.61   1.58
## as.factor(age_cat)4                       1.47  0.76   2.97
## patient_sexfemale                         0.77  0.53   1.14
## TB_RF_smokingyes                          0.90  0.60   1.36
## as.factor(cough_cat)2                     0.85  0.52   1.38
## as.factor(cough_cat)3                     0.80  0.54   1.18
## socio_cat2                                1.63  1.13   2.37
## socio_cat3                                1.33  0.81   2.22
## general_examination_malnutritionyes       0.15  0.09   0.25
## TB_categoryrelapse                        0.54  0.15   2.18
## Educationno                               0.90  0.55   1.50
## Educationsecondary                        0.73  0.47   1.14
## Educationuniversity                       1.07  0.49   2.56
## DR_categoryresistant                      1.59  0.69   4.06
## as.factor(Western_alr_cat)2:Intro101      2.31  0.51  10.65
## as.factor(Western_alr_cat)3:Intro101      1.40  0.38   5.25
## as.factor(Western_alr_cat)4:Intro101      1.75  0.48   6.39
## Intro101:as.factor(Eastern_alr_cat)2      1.27  0.42   3.78
## Intro101:as.factor(Eastern_alr_cat)3      1.46  0.48   4.39
## Intro101:as.factor(Southeastern_alr_cat)2 1.29  0.52   3.17
## Intro101:as.factor(Southeastern_alr_cat)3 1.05  0.40   2.72

does the removal of the cough duration variable change the results?

–> no it does not

library(lmtest)

model1 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat)* Intro10 + as.factor(Eastern_alr_cat) * Intro10 + as.factor(Southeastern_alr_cat)* Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking+  socio_cat + general_examination_malnutrition + TB_category + Education + DR_category, data = clean, family = binomial, na.action = na.exclude)
summary(model1)
## 
## Call:
## glm(formula = TB_score_2cat ~ as.factor(Western_alr_cat) * Intro10 + 
##     as.factor(Eastern_alr_cat) * Intro10 + as.factor(Southeastern_alr_cat) * 
##     Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking + 
##     socio_cat + general_examination_malnutrition + TB_category + 
##     Education + DR_category, family = binomial, data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                                1.29477    0.50706   2.553  0.01067
## as.factor(Western_alr_cat)2               -0.28334    0.46418  -0.610  0.54159
## as.factor(Western_alr_cat)3               -0.08695    0.42276  -0.206  0.83704
## as.factor(Western_alr_cat)4               -0.29141    0.41859  -0.696  0.48633
## Intro101                                  -0.89491    0.73870  -1.211  0.22572
## as.factor(Eastern_alr_cat)2                0.05466    0.32786   0.167  0.86760
## as.factor(Eastern_alr_cat)3               -0.06192    0.34453  -0.180  0.85737
## as.factor(Southeastern_alr_cat)2          -0.53776    0.27134  -1.982  0.04750
## as.factor(Southeastern_alr_cat)3           0.05851    0.29541   0.198  0.84299
## as.factor(age_cat)2                        0.28057    0.19918   1.409  0.15895
## as.factor(age_cat)3                       -0.02518    0.24041  -0.105  0.91657
## as.factor(age_cat)4                        0.37966    0.34563   1.098  0.27200
## patient_sexfemale                         -0.26135    0.19516  -1.339  0.18051
## TB_RF_smokingyes                          -0.09774    0.20954  -0.466  0.64088
## socio_cat2                                 0.48509    0.18802   2.580  0.00988
## socio_cat3                                 0.30416    0.25508   1.192  0.23310
## general_examination_malnutritionyes       -1.89609    0.25351  -7.479 7.46e-14
## TB_categoryrelapse                        -0.62036    0.65347  -0.949  0.34245
## Educationno                               -0.10148    0.25629  -0.396  0.69215
## Educationsecondary                        -0.30756    0.22392  -1.374  0.16959
## Educationuniversity                        0.09174    0.41773   0.220  0.82617
## DR_categoryresistant                       0.46382    0.44616   1.040  0.29854
## as.factor(Western_alr_cat)2:Intro101       0.87734    0.76999   1.139  0.25453
## as.factor(Western_alr_cat)3:Intro101       0.38940    0.66867   0.582  0.56033
## as.factor(Western_alr_cat)4:Intro101       0.58315    0.65914   0.885  0.37631
## Intro101:as.factor(Eastern_alr_cat)2       0.22380    0.55489   0.403  0.68672
## Intro101:as.factor(Eastern_alr_cat)3       0.36973    0.56070   0.659  0.50963
## Intro101:as.factor(Southeastern_alr_cat)2  0.25430    0.45769   0.556  0.57848
## Intro101:as.factor(Southeastern_alr_cat)3  0.04665    0.48649   0.096  0.92361
##                                              
## (Intercept)                               *  
## as.factor(Western_alr_cat)2                  
## as.factor(Western_alr_cat)3                  
## as.factor(Western_alr_cat)4                  
## Intro101                                     
## as.factor(Eastern_alr_cat)2                  
## as.factor(Eastern_alr_cat)3                  
## as.factor(Southeastern_alr_cat)2          *  
## as.factor(Southeastern_alr_cat)3             
## as.factor(age_cat)2                          
## as.factor(age_cat)3                          
## as.factor(age_cat)4                          
## patient_sexfemale                            
## TB_RF_smokingyes                             
## socio_cat2                                ** 
## socio_cat3                                   
## general_examination_malnutritionyes       ***
## TB_categoryrelapse                           
## Educationno                                  
## Educationsecondary                           
## Educationuniversity                          
## DR_categoryresistant                         
## as.factor(Western_alr_cat)2:Intro101         
## as.factor(Western_alr_cat)3:Intro101         
## as.factor(Western_alr_cat)4:Intro101         
## Intro101:as.factor(Eastern_alr_cat)2         
## Intro101:as.factor(Eastern_alr_cat)3         
## Intro101:as.factor(Southeastern_alr_cat)2    
## Intro101:as.factor(Southeastern_alr_cat)3    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.59  on 826  degrees of freedom
## Residual deviance:  933.42  on 798  degrees of freedom
## AIC: 991.42
## 
## Number of Fisher Scoring iterations: 4
# compare to a model without the interaction
model2 <- glm(TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat)+ as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + patient_sex + TB_RF_smoking+ socio_cat+ general_examination_malnutrition + TB_category + Education + DR_category, data = clean, family = binomial, na.action = na.exclude)
lrtest(model1, model2)
## Likelihood ratio test
## 
## Model 1: TB_score_2cat ~ as.factor(Western_alr_cat) * Intro10 + as.factor(Eastern_alr_cat) * 
##     Intro10 + as.factor(Southeastern_alr_cat) * Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + socio_cat + general_examination_malnutrition + 
##     TB_category + Education + DR_category
## Model 2: TB_score_2cat ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + as.factor(age_cat) + 
##     patient_sex + TB_RF_smoking + socio_cat + general_examination_malnutrition + 
##     TB_category + Education + DR_category
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1  29 -466.71                     
## 2  22 -468.01 -7 2.6048      0.919
ORs <- exp(cbind(OR = coef(model1), confint(model1)))
## Waiting for profiling to be done...
round(ORs, digits = 2)
##                                             OR 2.5 % 97.5 %
## (Intercept)                               3.65  1.38  10.15
## as.factor(Western_alr_cat)2               0.75  0.30   1.85
## as.factor(Western_alr_cat)3               0.92  0.39   2.06
## as.factor(Western_alr_cat)4               0.75  0.32   1.66
## Intro101                                  0.41  0.10   1.75
## as.factor(Eastern_alr_cat)2               1.06  0.55   1.99
## as.factor(Eastern_alr_cat)3               0.94  0.47   1.83
## as.factor(Southeastern_alr_cat)2          0.58  0.34   0.99
## as.factor(Southeastern_alr_cat)3          1.06  0.59   1.89
## as.factor(age_cat)2                       1.32  0.90   1.96
## as.factor(age_cat)3                       0.98  0.61   1.57
## as.factor(age_cat)4                       1.46  0.76   2.95
## patient_sexfemale                         0.77  0.53   1.13
## TB_RF_smokingyes                          0.91  0.60   1.37
## socio_cat2                                1.62  1.13   2.36
## socio_cat3                                1.36  0.83   2.26
## general_examination_malnutritionyes       0.15  0.09   0.24
## TB_categoryrelapse                        0.54  0.15   2.14
## Educationno                               0.90  0.55   1.51
## Educationsecondary                        0.74  0.48   1.14
## Educationuniversity                       1.10  0.50   2.61
## DR_categoryresistant                      1.59  0.69   4.07
## as.factor(Western_alr_cat)2:Intro101      2.40  0.53  11.05
## as.factor(Western_alr_cat)3:Intro101      1.48  0.40   5.51
## as.factor(Western_alr_cat)4:Intro101      1.79  0.49   6.56
## Intro101:as.factor(Eastern_alr_cat)2      1.25  0.42   3.70
## Intro101:as.factor(Eastern_alr_cat)3      1.45  0.48   4.34
## Intro101:as.factor(Southeastern_alr_cat)2 1.29  0.52   3.15
## Intro101:as.factor(Southeastern_alr_cat)3 1.05  0.40   2.71

now for the Ct-value

Check, whether the assumptions for a glm are met

# create a clean dataset with no missing values for any of the ancestries, bacterial genotype, and the covariates included and it only comprises HIV negative patients

clean <- na.omit(new_data[new_data$HIV_status == "negative", c("Western_alr_cat", "Southeastern_alr_cat", "Eastern_alr_cat", "patient_sex", "age_cat", "symptoms_duration_cough_duration", "Intro10", "TB_RF_smoking", "Ct_value", "cough_cat", "socio_cat", "general_examination_malnutrition", "TB_category", "Education", "DR_category")])

dim(clean)
## [1] 472  15
# Linearity
# for the ancestries as category
model1 <- glm(Ct_value ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat), data = clean, na.action = na.exclude, family = "gaussian")


par(mfrow = c(1,3))
for (anc in c("Western_alr_cat", "Eastern_alr_cat", "Southeastern_alr_cat")){
boxplot(residuals(model1) ~ clean[[anc]], main = paste("Residuals vs ", anc))
abline(h = 0, col = "red")}

# Homoscedasticity

plot(model1, which = 1)

# Normality of Residuals
# QQ-Plot

qqnorm(residuals(model1))
qqline(residuals(model1), col = "red")

# Histogram of residuals

hist(residuals(model1), main = "Histogram of residuals", xlab = "Residuals")

–> the qq-plot does not look good I would say.

try a log transformation of the ct_value

# Linearity
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat), data = clean, na.action = na.exclude, family = "gaussian")


par(mfrow = c(1,3))
for (anc in c("Western_alr_cat", "Eastern_alr_cat", "Southeastern_alr_cat")){
boxplot(residuals(model1) ~ clean[[anc]], main = paste("Residuals vs ", anc))
abline(h = 0, col = "red")}

# Homoscedasticity

plot(model1, which = 1)

# Normality of Residuals
# QQ-Plot

qqnorm(residuals(model1))
qqline(residuals(model1), col = "red")

# Histogram of residuals

hist(residuals(model1), main = "Histogram of residuals", xlab = "Residuals")

–> The qq-plot looks much better

continue with log10 transformation

library(lmtest)

# Linearity
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat), data = clean, na.action = na.exclude, family = "gaussian")


summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat), 
##     family = "gaussian", data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.2313393  0.0205035  60.055   <2e-16 ***
## as.factor(Western_alr_cat)2       0.0343062  0.0208663   1.644    0.101    
## as.factor(Western_alr_cat)3       0.0308501  0.0187430   1.646    0.100    
## as.factor(Western_alr_cat)4       0.0226902  0.0186398   1.217    0.224    
## as.factor(Eastern_alr_cat)2      -0.0047696  0.0149368  -0.319    0.750    
## as.factor(Eastern_alr_cat)3       0.0147742  0.0153897   0.960    0.338    
## as.factor(Southeastern_alr_cat)2  0.0144342  0.0124202   1.162    0.246    
## as.factor(Southeastern_alr_cat)3  0.0008782  0.0127372   0.069    0.945    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009729422)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.5145  on 464  degrees of freedom
## AIC: -837.18
## 
## Number of Fisher Scoring iterations: 2

add the introduction

library(lmtest)

# Linearity
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10, data = clean, na.action = na.exclude, family = "gaussian")

model2 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat), data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     Intro10, family = "gaussian", data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.2382529  0.0207702  59.617   <2e-16 ***
## as.factor(Western_alr_cat)2       0.0336703  0.0208111   1.618   0.1064    
## as.factor(Western_alr_cat)3       0.0305594  0.0186916   1.635   0.1027    
## as.factor(Western_alr_cat)4       0.0210707  0.0186078   1.132   0.2581    
## as.factor(Eastern_alr_cat)2      -0.0049609  0.0148957  -0.333   0.7393    
## as.factor(Eastern_alr_cat)3       0.0168406  0.0153858   1.095   0.2743    
## as.factor(Southeastern_alr_cat)2  0.0145739  0.0123860   1.177   0.2399    
## as.factor(Southeastern_alr_cat)3  0.0006072  0.0127027   0.048   0.9619    
## Intro101                         -0.0178019  0.0094024  -1.893   0.0589 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009675524)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.4798  on 463  degrees of freedom
## AIC: -838.82
## 
## Number of Fisher Scoring iterations: 2
lrtest(model1, model2)
## Likelihood ratio test
## 
## Model 1: log10(Ct_value) ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10
## Model 2: log10(Ct_value) ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat)
##   #Df LogLik Df  Chisq Pr(>Chisq)  
## 1  10 429.41                       
## 2   9 427.59 -1 3.6403     0.0564 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

add the covariates one by one

library(lmtest)

# sex
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex, data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     Intro10 + patient_sex, family = "gaussian", data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.2365419  0.0211833  58.374   <2e-16 ***
## as.factor(Western_alr_cat)2       0.0338272  0.0208330   1.624   0.1051    
## as.factor(Western_alr_cat)3       0.0306377  0.0187092   1.638   0.1022    
## as.factor(Western_alr_cat)4       0.0213706  0.0186380   1.147   0.2521    
## as.factor(Eastern_alr_cat)2      -0.0046009  0.0149335  -0.308   0.7582    
## as.factor(Eastern_alr_cat)3       0.0169047  0.0154002   1.098   0.2729    
## as.factor(Southeastern_alr_cat)2  0.0147696  0.0124058   1.191   0.2344    
## as.factor(Southeastern_alr_cat)3  0.0006629  0.0127147   0.052   0.9584    
## Intro101                         -0.0176160  0.0094212  -1.870   0.0621 .  
## patient_sexfemale                 0.0042923  0.0102094   0.420   0.6744    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009692759)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.4781  on 462  degrees of freedom
## AIC: -837
## 
## Number of Fisher Scoring iterations: 2
library(lmtest)

# age
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + as.factor(age_cat), data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     Intro10 + patient_sex + as.factor(age_cat), family = "gaussian", 
##     data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.2368544  0.0217637  56.831   <2e-16 ***
## as.factor(Western_alr_cat)2       0.0340527  0.0208940   1.630    0.104    
## as.factor(Western_alr_cat)3       0.0306916  0.0187777   1.634    0.103    
## as.factor(Western_alr_cat)4       0.0212797  0.0187192   1.137    0.256    
## as.factor(Eastern_alr_cat)2      -0.0048509  0.0149732  -0.324    0.746    
## as.factor(Eastern_alr_cat)3       0.0168910  0.0154657   1.092    0.275    
## as.factor(Southeastern_alr_cat)2  0.0147314  0.0124421   1.184    0.237    
## as.factor(Southeastern_alr_cat)3  0.0006154  0.0127598   0.048    0.962    
## Intro101                         -0.0169224  0.0094836  -1.784    0.075 .  
## patient_sexfemale                 0.0046668  0.0104160   0.448    0.654    
## as.factor(age_cat)2              -0.0016045  0.0106393  -0.151    0.880    
## as.factor(age_cat)3              -0.0055395  0.0134709  -0.411    0.681    
## as.factor(age_cat)4               0.0113204  0.0180264   0.628    0.530    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009740243)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.4708  on 459  degrees of freedom
## AIC: -831.77
## 
## Number of Fisher Scoring iterations: 2
library(lmtest)

# smoking
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking, data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking, 
##     family = "gaussian", data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.2387850  0.0218015  56.821   <2e-16 ***
## as.factor(Western_alr_cat)2       0.0331143  0.0208928   1.585   0.1137    
## as.factor(Western_alr_cat)3       0.0311098  0.0187677   1.658   0.0981 .  
## as.factor(Western_alr_cat)4       0.0208751  0.0187092   1.116   0.2651    
## as.factor(Eastern_alr_cat)2      -0.0033005  0.0150123  -0.220   0.8261    
## as.factor(Eastern_alr_cat)3       0.0187428  0.0155232   1.207   0.2279    
## as.factor(Southeastern_alr_cat)2  0.0151121  0.0124372   1.215   0.2250    
## as.factor(Southeastern_alr_cat)3  0.0007898  0.0127519   0.062   0.9506    
## Intro101                         -0.0156806  0.0095271  -1.646   0.1005    
## patient_sexfemale                -0.0003614  0.0111307  -0.032   0.9741    
## as.factor(age_cat)2               0.0003732  0.0107446   0.035   0.9723    
## as.factor(age_cat)3              -0.0017997  0.0137775  -0.131   0.8961    
## as.factor(age_cat)4               0.0139664  0.0181333   0.770   0.4416    
## TB_RF_smokingyes                 -0.0146135  0.0114599  -1.275   0.2029    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009726974)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.4550  on 458  degrees of freedom
## AIC: -831.44
## 
## Number of Fisher Scoring iterations: 2
library(lmtest)

# cough duration
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + as.factor(cough_cat), data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + 
##     as.factor(cough_cat), family = "gaussian", data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.273618   0.023149  55.019  < 2e-16 ***
## as.factor(Western_alr_cat)2       0.026432   0.020672   1.279 0.201681    
## as.factor(Western_alr_cat)3       0.025369   0.018540   1.368 0.171862    
## as.factor(Western_alr_cat)4       0.014567   0.018497   0.788 0.431389    
## as.factor(Eastern_alr_cat)2      -0.001649   0.014790  -0.112 0.911253    
## as.factor(Eastern_alr_cat)3       0.017237   0.015300   1.127 0.260509    
## as.factor(Southeastern_alr_cat)2  0.011551   0.012287   0.940 0.347675    
## as.factor(Southeastern_alr_cat)3 -0.002972   0.012594  -0.236 0.813551    
## Intro101                         -0.010700   0.009464  -1.131 0.258849    
## patient_sexfemale                -0.001087   0.010965  -0.099 0.921061    
## as.factor(age_cat)2               0.000589   0.010586   0.056 0.955655    
## as.factor(age_cat)3              -0.001074   0.013570  -0.079 0.936949    
## as.factor(age_cat)4               0.015888   0.017865   0.889 0.374277    
## TB_RF_smokingyes                 -0.017050   0.011308  -1.508 0.132300    
## as.factor(cough_cat)2            -0.040415   0.013141  -3.076 0.002227 ** 
## as.factor(cough_cat)3            -0.039724   0.010471  -3.794 0.000168 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009433898)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.3019  on 456  degrees of freedom
## AIC: -843.95
## 
## Number of Fisher Scoring iterations: 2
library(lmtest)

# socioeconomic status
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + as.factor(cough_cat) + socio_cat, data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + 
##     as.factor(cough_cat) + socio_cat, family = "gaussian", data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.2710949  0.0241330  52.670  < 2e-16 ***
## as.factor(Western_alr_cat)2       0.0268639  0.0207674   1.294 0.196475    
## as.factor(Western_alr_cat)3       0.0254099  0.0185765   1.368 0.172037    
## as.factor(Western_alr_cat)4       0.0142943  0.0185401   0.771 0.441112    
## as.factor(Eastern_alr_cat)2      -0.0013832  0.0148299  -0.093 0.925727    
## as.factor(Eastern_alr_cat)3       0.0175194  0.0153576   1.141 0.254569    
## as.factor(Southeastern_alr_cat)2  0.0121944  0.0123914   0.984 0.325589    
## as.factor(Southeastern_alr_cat)3 -0.0021835  0.0127821  -0.171 0.864440    
## Intro101                         -0.0105840  0.0094853  -1.116 0.265086    
## patient_sexfemale                -0.0006988  0.0110128  -0.063 0.949430    
## as.factor(age_cat)2               0.0009028  0.0106319   0.085 0.932365    
## as.factor(age_cat)3              -0.0006416  0.0136224  -0.047 0.962457    
## as.factor(age_cat)4               0.0155325  0.0179655   0.865 0.387730    
## TB_RF_smokingyes                 -0.0181509  0.0115342  -1.574 0.116263    
## as.factor(cough_cat)2            -0.0405626  0.0131690  -3.080 0.002195 ** 
## as.factor(cough_cat)3            -0.0395175  0.0105171  -3.757 0.000194 ***
## socio_cat2                        0.0019392  0.0100697   0.193 0.847376    
## socio_cat3                        0.0072633  0.0142476   0.510 0.610447    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009470011)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.2994  on 454  degrees of freedom
## AIC: -840.22
## 
## Number of Fisher Scoring iterations: 2
library(lmtest)

# socioeconomic status
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition, data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + 
##     as.factor(cough_cat) + socio_cat + general_examination_malnutrition, 
##     family = "gaussian", data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          1.2700377  0.0243609  52.134  < 2e-16 ***
## as.factor(Western_alr_cat)2          0.0273308  0.0208341   1.312  0.19024    
## as.factor(Western_alr_cat)3          0.0263375  0.0187988   1.401  0.16189    
## as.factor(Western_alr_cat)4          0.0152098  0.0187574   0.811  0.41787    
## as.factor(Eastern_alr_cat)2         -0.0014524  0.0148459  -0.098  0.92211    
## as.factor(Eastern_alr_cat)3          0.0173089  0.0153854   1.125  0.26118    
## as.factor(Southeastern_alr_cat)2     0.0123161  0.0124088   0.993  0.32147    
## as.factor(Southeastern_alr_cat)3    -0.0021354  0.0127954  -0.167  0.86754    
## Intro101                            -0.0104528  0.0095026  -1.100  0.27192    
## patient_sexfemale                   -0.0009079  0.0110412  -0.082  0.93450    
## as.factor(age_cat)2                  0.0010626  0.0106529   0.100  0.92059    
## as.factor(age_cat)3                 -0.0006036  0.0136362  -0.044  0.96471    
## as.factor(age_cat)4                  0.0156984  0.0179898   0.873  0.38333    
## TB_RF_smokingyes                    -0.0186220  0.0116304  -1.601  0.11004    
## as.factor(cough_cat)2               -0.0405017  0.0131832  -3.072  0.00225 ** 
## as.factor(cough_cat)3               -0.0396298  0.0105327  -3.763  0.00019 ***
## socio_cat2                           0.0018717  0.0100815   0.186  0.85279    
## socio_cat3                           0.0074621  0.0142738   0.523  0.60138    
## general_examination_malnutritionyes  0.0059837  0.0178178   0.336  0.73716    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009488554)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.2983  on 453  degrees of freedom
## AIC: -838.33
## 
## Number of Fisher Scoring iterations: 2
library(lmtest)

# socioeconomic status
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition + TB_category, data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + 
##     as.factor(cough_cat) + socio_cat + general_examination_malnutrition + 
##     TB_category, family = "gaussian", data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          1.2702105  0.0244139  52.028  < 2e-16 ***
## as.factor(Western_alr_cat)2          0.0273058  0.0208573   1.309 0.191142    
## as.factor(Western_alr_cat)3          0.0261936  0.0188430   1.390 0.165183    
## as.factor(Western_alr_cat)4          0.0151269  0.0187857   0.805 0.421108    
## as.factor(Eastern_alr_cat)2         -0.0015469  0.0148750  -0.104 0.917222    
## as.factor(Eastern_alr_cat)3          0.0173400  0.0154034   1.126 0.260879    
## as.factor(Southeastern_alr_cat)2     0.0122626  0.0124272   0.987 0.324289    
## as.factor(Southeastern_alr_cat)3    -0.0022721  0.0128410  -0.177 0.859632    
## Intro101                            -0.0105080  0.0095199  -1.104 0.270271    
## patient_sexfemale                   -0.0008919  0.0110536  -0.081 0.935728    
## as.factor(age_cat)2                  0.0010334  0.0106661   0.097 0.922857    
## as.factor(age_cat)3                 -0.0006048  0.0136510  -0.044 0.964679    
## as.factor(age_cat)4                  0.0154763  0.0180688   0.857 0.392163    
## TB_RF_smokingyes                    -0.0186900  0.0116516  -1.604 0.109397    
## as.factor(cough_cat)2               -0.0404355  0.0132046  -3.062 0.002328 ** 
## as.factor(cough_cat)3               -0.0396875  0.0105510  -3.761 0.000191 ***
## socio_cat2                           0.0018660  0.0100925   0.185 0.853399    
## socio_cat3                           0.0074870  0.0142902   0.524 0.600585    
## general_examination_malnutritionyes  0.0060648  0.0178451   0.340 0.734120    
## TB_categoryrelapse                   0.0053686  0.0354307   0.152 0.879630    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009509063)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.2981  on 452  degrees of freedom
## AIC: -836.36
## 
## Number of Fisher Scoring iterations: 2
library(lmtest)

# socioeconomic status
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition + TB_category + Education, data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + 
##     as.factor(cough_cat) + socio_cat + general_examination_malnutrition + 
##     TB_category + Education, family = "gaussian", data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          1.2702072  0.0247740  51.272  < 2e-16 ***
## as.factor(Western_alr_cat)2          0.0270033  0.0209049   1.292 0.197119    
## as.factor(Western_alr_cat)3          0.0261484  0.0188619   1.386 0.166342    
## as.factor(Western_alr_cat)4          0.0151208  0.0188271   0.803 0.422318    
## as.factor(Eastern_alr_cat)2         -0.0013482  0.0148639  -0.091 0.927771    
## as.factor(Eastern_alr_cat)3          0.0162746  0.0154108   1.056 0.291512    
## as.factor(Southeastern_alr_cat)2     0.0122515  0.0124475   0.984 0.325518    
## as.factor(Southeastern_alr_cat)3    -0.0009383  0.0129520  -0.072 0.942283    
## Intro101                            -0.0096428  0.0095276  -1.012 0.312038    
## patient_sexfemale                   -0.0008778  0.0111061  -0.079 0.937041    
## as.factor(age_cat)2                  0.0039443  0.0112296   0.351 0.725571    
## as.factor(age_cat)3                  0.0010962  0.0139612   0.079 0.937449    
## as.factor(age_cat)4                  0.0175929  0.0182754   0.963 0.336238    
## TB_RF_smokingyes                    -0.0159454  0.0117503  -1.357 0.175456    
## as.factor(cough_cat)2               -0.0396451  0.0132018  -3.003 0.002823 ** 
## as.factor(cough_cat)3               -0.0391892  0.0105471  -3.716 0.000228 ***
## socio_cat2                           0.0010120  0.0101636   0.100 0.920730    
## socio_cat3                           0.0038285  0.0144479   0.265 0.791140    
## general_examination_malnutritionyes  0.0102834  0.0179618   0.573 0.567261    
## TB_categoryrelapse                   0.0049328  0.0354320   0.139 0.889340    
## Educationno                         -0.0239194  0.0142524  -1.678 0.093989 .  
## Educationsecondary                  -0.0019552  0.0123818  -0.158 0.874601    
## Educationuniversity                  0.0188758  0.0219645   0.859 0.390592    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009491995)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.2619  on 449  degrees of freedom
## AIC: -834.35
## 
## Number of Fisher Scoring iterations: 2
library(lmtest)

# socioeconomic status
# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + as.factor(cough_cat) + socio_cat + general_examination_malnutrition + TB_category + Education + DR_category, data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + 
##     as.factor(cough_cat) + socio_cat + general_examination_malnutrition + 
##     TB_category + Education + DR_category, family = "gaussian", 
##     data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          1.2701163  0.0248051  51.204  < 2e-16 ***
## as.factor(Western_alr_cat)2          0.0271249  0.0209368   1.296 0.195795    
## as.factor(Western_alr_cat)3          0.0260723  0.0188863   1.380 0.168124    
## as.factor(Western_alr_cat)4          0.0150857  0.0188482   0.800 0.423918    
## as.factor(Eastern_alr_cat)2         -0.0013131  0.0148809  -0.088 0.929723    
## as.factor(Eastern_alr_cat)3          0.0162548  0.0154277   1.054 0.292628    
## as.factor(Southeastern_alr_cat)2     0.0124595  0.0125072   0.996 0.319699    
## as.factor(Southeastern_alr_cat)3    -0.0008475  0.0129744  -0.065 0.947949    
## Intro101                            -0.0096538  0.0095380  -1.012 0.312014    
## patient_sexfemale                   -0.0008534  0.0111187  -0.077 0.938856    
## as.factor(age_cat)2                  0.0040954  0.0112688   0.363 0.716457    
## as.factor(age_cat)3                  0.0012880  0.0140114   0.092 0.926800    
## as.factor(age_cat)4                  0.0178601  0.0183472   0.973 0.330854    
## TB_RF_smokingyes                    -0.0161377  0.0118049  -1.367 0.172301    
## as.factor(cough_cat)2               -0.0395393  0.0132273  -2.989 0.002951 ** 
## as.factor(cough_cat)3               -0.0391286  0.0105631  -3.704 0.000238 ***
## socio_cat2                           0.0011549  0.0102014   0.113 0.909912    
## socio_cat3                           0.0038470  0.0144637   0.266 0.790379    
## general_examination_malnutritionyes  0.0101227  0.0180003   0.562 0.574150    
## TB_categoryrelapse                   0.0047640  0.0354808   0.134 0.893251    
## Educationno                         -0.0239086  0.0142678  -1.676 0.094494 .  
## Educationsecondary                  -0.0019037  0.0123979  -0.154 0.878037    
## Educationuniversity                  0.0188650  0.0219882   0.858 0.391372    
## DR_categoryresistant                -0.0050409  0.0260864  -0.193 0.846860    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009512389)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.2616  on 448  degrees of freedom
## AIC: -832.39
## 
## Number of Fisher Scoring iterations: 2

test, whether a model including the ancestries is better than a model without human ancestry

all ancestries

add more covariates

library(lmtest)

# for the ancestries as category
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + as.factor(cough_cat) + socio_cat + TB_category + general_examination_malnutrition + Education + DR_category, data = clean, na.action = na.exclude, family = "gaussian")
model2 <- glm(log10(Ct_value) ~ Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + as.factor(cough_cat) + socio_cat+ TB_category + general_examination_malnutrition + Education +DR_category, data = clean, na.action = na.exclude, family = "gaussian")
lrtest(model1, model2)
## Likelihood ratio test
## 
## Model 1: log10(Ct_value) ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + 
##     as.factor(age_cat) + TB_RF_smoking + as.factor(cough_cat) + 
##     socio_cat + TB_category + general_examination_malnutrition + 
##     Education + DR_category
## Model 2: log10(Ct_value) ~ Intro10 + patient_sex + as.factor(age_cat) + 
##     TB_RF_smoking + as.factor(cough_cat) + socio_cat + TB_category + 
##     general_examination_malnutrition + Education + DR_category
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1  25 441.19                     
## 2  18 436.80 -7 8.7901     0.2681
summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) + 
##     as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + 
##     Intro10 + patient_sex + as.factor(age_cat) + TB_RF_smoking + 
##     as.factor(cough_cat) + socio_cat + TB_category + general_examination_malnutrition + 
##     Education + DR_category, family = "gaussian", data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          1.2701163  0.0248051  51.204  < 2e-16 ***
## as.factor(Western_alr_cat)2          0.0271249  0.0209368   1.296 0.195795    
## as.factor(Western_alr_cat)3          0.0260723  0.0188863   1.380 0.168124    
## as.factor(Western_alr_cat)4          0.0150857  0.0188482   0.800 0.423918    
## as.factor(Eastern_alr_cat)2         -0.0013131  0.0148809  -0.088 0.929723    
## as.factor(Eastern_alr_cat)3          0.0162548  0.0154277   1.054 0.292628    
## as.factor(Southeastern_alr_cat)2     0.0124595  0.0125072   0.996 0.319699    
## as.factor(Southeastern_alr_cat)3    -0.0008475  0.0129744  -0.065 0.947949    
## Intro101                            -0.0096538  0.0095380  -1.012 0.312014    
## patient_sexfemale                   -0.0008534  0.0111187  -0.077 0.938856    
## as.factor(age_cat)2                  0.0040954  0.0112688   0.363 0.716457    
## as.factor(age_cat)3                  0.0012880  0.0140114   0.092 0.926800    
## as.factor(age_cat)4                  0.0178601  0.0183472   0.973 0.330854    
## TB_RF_smokingyes                    -0.0161377  0.0118049  -1.367 0.172301    
## as.factor(cough_cat)2               -0.0395393  0.0132273  -2.989 0.002951 ** 
## as.factor(cough_cat)3               -0.0391286  0.0105631  -3.704 0.000238 ***
## socio_cat2                           0.0011549  0.0102014   0.113 0.909912    
## socio_cat3                           0.0038470  0.0144637   0.266 0.790379    
## TB_categoryrelapse                   0.0047640  0.0354808   0.134 0.893251    
## general_examination_malnutritionyes  0.0101227  0.0180003   0.562 0.574150    
## Educationno                         -0.0239086  0.0142678  -1.676 0.094494 .  
## Educationsecondary                  -0.0019037  0.0123979  -0.154 0.878037    
## Educationuniversity                  0.0188650  0.0219882   0.858 0.391372    
## DR_categoryresistant                -0.0050409  0.0260864  -0.193 0.846860    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009512389)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.2616  on 448  degrees of freedom
## AIC: -832.39
## 
## Number of Fisher Scoring iterations: 2
ORs <- exp(cbind(OR = coef(model1), confint(model1)))
## Waiting for profiling to be done...
round(ORs, digits = 2)
##                                       OR 2.5 % 97.5 %
## (Intercept)                         3.56  3.39   3.74
## as.factor(Western_alr_cat)2         1.03  0.99   1.07
## as.factor(Western_alr_cat)3         1.03  0.99   1.07
## as.factor(Western_alr_cat)4         1.02  0.98   1.05
## as.factor(Eastern_alr_cat)2         1.00  0.97   1.03
## as.factor(Eastern_alr_cat)3         1.02  0.99   1.05
## as.factor(Southeastern_alr_cat)2    1.01  0.99   1.04
## as.factor(Southeastern_alr_cat)3    1.00  0.97   1.02
## Intro101                            0.99  0.97   1.01
## patient_sexfemale                   1.00  0.98   1.02
## as.factor(age_cat)2                 1.00  0.98   1.03
## as.factor(age_cat)3                 1.00  0.97   1.03
## as.factor(age_cat)4                 1.02  0.98   1.06
## TB_RF_smokingyes                    0.98  0.96   1.01
## as.factor(cough_cat)2               0.96  0.94   0.99
## as.factor(cough_cat)3               0.96  0.94   0.98
## socio_cat2                          1.00  0.98   1.02
## socio_cat3                          1.00  0.98   1.03
## TB_categoryrelapse                  1.00  0.94   1.08
## general_examination_malnutritionyes 1.01  0.98   1.05
## Educationno                         0.98  0.95   1.00
## Educationsecondary                  1.00  0.97   1.02
## Educationuniversity                 1.02  0.98   1.06
## DR_categoryresistant                0.99  0.95   1.05

add the interaction

with more covariates

library(lmtest)

# cough duration
# for the ancestries as category
model2 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + age_cat + TB_RF_smoking + as.factor(cough_cat) + socio_cat + TB_category + general_examination_malnutrition + Education + DR_category, data = clean, na.action = na.exclude, family = "gaussian")
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)*Intro10+ as.factor(Eastern_alr_cat)*Intro10 + as.factor(Southeastern_alr_cat)*Intro10 + patient_sex + age_cat + TB_RF_smoking + as.factor(cough_cat) + socio_cat+ TB_category + general_examination_malnutrition + Education + DR_category, data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) * 
##     Intro10 + as.factor(Eastern_alr_cat) * Intro10 + as.factor(Southeastern_alr_cat) * 
##     Intro10 + patient_sex + age_cat + TB_RF_smoking + as.factor(cough_cat) + 
##     socio_cat + TB_category + general_examination_malnutrition + 
##     Education + DR_category, family = "gaussian", data = clean, 
##     na.action = na.exclude)
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                                1.2610305  0.0299844  42.056
## as.factor(Western_alr_cat)2                0.0342423  0.0270854   1.264
## as.factor(Western_alr_cat)3                0.0323883  0.0245434   1.320
## as.factor(Western_alr_cat)4                0.0293651  0.0245371   1.197
## Intro101                                   0.0140718  0.0424874   0.331
## as.factor(Eastern_alr_cat)2               -0.0070836  0.0181736  -0.390
## as.factor(Eastern_alr_cat)3                0.0153460  0.0194231   0.790
## as.factor(Southeastern_alr_cat)2           0.0200084  0.0157656   1.269
## as.factor(Southeastern_alr_cat)3           0.0006993  0.0163213   0.043
## patient_sexfemale                         -0.0024662  0.0112616  -0.219
## age_cat2                                   0.0041994  0.0114221   0.368
## age_cat3                                   0.0010683  0.0141891   0.075
## age_cat4                                   0.0168342  0.0185634   0.907
## TB_RF_smokingyes                          -0.0175783  0.0119825  -1.467
## as.factor(cough_cat)2                     -0.0399688  0.0133557  -2.993
## as.factor(cough_cat)3                     -0.0387184  0.0106419  -3.638
## socio_cat2                                 0.0013931  0.0103017   0.135
## socio_cat3                                 0.0064851  0.0147087   0.441
## TB_categoryrelapse                        -0.0018087  0.0360697  -0.050
## general_examination_malnutritionyes        0.0124321  0.0181803   0.684
## Educationno                               -0.0252910  0.0145431  -1.739
## Educationsecondary                        -0.0035739  0.0125610  -0.285
## Educationuniversity                        0.0195683  0.0221869   0.882
## DR_categoryresistant                      -0.0059354  0.0263334  -0.225
## as.factor(Western_alr_cat)2:Intro101      -0.0206197  0.0430957  -0.478
## as.factor(Western_alr_cat)3:Intro101      -0.0144263  0.0390595  -0.369
## as.factor(Western_alr_cat)4:Intro101      -0.0379369  0.0391153  -0.970
## Intro101:as.factor(Eastern_alr_cat)2       0.0224824  0.0325223   0.691
## Intro101:as.factor(Eastern_alr_cat)3       0.0062283  0.0329512   0.189
## Intro101:as.factor(Southeastern_alr_cat)2 -0.0223865  0.0264205  -0.847
## Intro101:as.factor(Southeastern_alr_cat)3 -0.0086008  0.0273516  -0.314
##                                           Pr(>|t|)    
## (Intercept)                                < 2e-16 ***
## as.factor(Western_alr_cat)2               0.206815    
## as.factor(Western_alr_cat)3               0.187641    
## as.factor(Western_alr_cat)4               0.232041    
## Intro101                                  0.740651    
## as.factor(Eastern_alr_cat)2               0.696891    
## as.factor(Eastern_alr_cat)3               0.429900    
## as.factor(Southeastern_alr_cat)2          0.205067    
## as.factor(Southeastern_alr_cat)3          0.965845    
## patient_sexfemale                         0.826757    
## age_cat2                                  0.713304    
## age_cat3                                  0.940017    
## age_cat4                                  0.364982    
## TB_RF_smokingyes                          0.143089    
## as.factor(cough_cat)2                     0.002921 ** 
## as.factor(cough_cat)3                     0.000307 ***
## socio_cat2                                0.892494    
## socio_cat3                                0.659498    
## TB_categoryrelapse                        0.960030    
## general_examination_malnutritionyes       0.494447    
## Educationno                               0.082725 .  
## Educationsecondary                        0.776140    
## Educationuniversity                       0.378270    
## DR_categoryresistant                      0.821777    
## as.factor(Western_alr_cat)2:Intro101      0.632558    
## as.factor(Western_alr_cat)3:Intro101      0.712050    
## as.factor(Western_alr_cat)4:Intro101      0.332641    
## Intro101:as.factor(Eastern_alr_cat)2      0.489747    
## Intro101:as.factor(Eastern_alr_cat)3      0.850167    
## Intro101:as.factor(Southeastern_alr_cat)2 0.397278    
## Intro101:as.factor(Southeastern_alr_cat)3 0.753325    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009593273)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.2306  on 441  degrees of freedom
## AIC: -821.83
## 
## Number of Fisher Scoring iterations: 2
ORs <- exp(cbind(OR = coef(model1), confint(model1)))
## Waiting for profiling to be done...
round(ORs, digits = 2)
##                                             OR 2.5 % 97.5 %
## (Intercept)                               3.53  3.33   3.74
## as.factor(Western_alr_cat)2               1.03  0.98   1.09
## as.factor(Western_alr_cat)3               1.03  0.98   1.08
## as.factor(Western_alr_cat)4               1.03  0.98   1.08
## Intro101                                  1.01  0.93   1.10
## as.factor(Eastern_alr_cat)2               0.99  0.96   1.03
## as.factor(Eastern_alr_cat)3               1.02  0.98   1.05
## as.factor(Southeastern_alr_cat)2          1.02  0.99   1.05
## as.factor(Southeastern_alr_cat)3          1.00  0.97   1.03
## patient_sexfemale                         1.00  0.98   1.02
## age_cat2                                  1.00  0.98   1.03
## age_cat3                                  1.00  0.97   1.03
## age_cat4                                  1.02  0.98   1.05
## TB_RF_smokingyes                          0.98  0.96   1.01
## as.factor(cough_cat)2                     0.96  0.94   0.99
## as.factor(cough_cat)3                     0.96  0.94   0.98
## socio_cat2                                1.00  0.98   1.02
## socio_cat3                                1.01  0.98   1.04
## TB_categoryrelapse                        1.00  0.93   1.07
## general_examination_malnutritionyes       1.01  0.98   1.05
## Educationno                               0.98  0.95   1.00
## Educationsecondary                        1.00  0.97   1.02
## Educationuniversity                       1.02  0.98   1.07
## DR_categoryresistant                      0.99  0.94   1.05
## as.factor(Western_alr_cat)2:Intro101      0.98  0.90   1.07
## as.factor(Western_alr_cat)3:Intro101      0.99  0.91   1.06
## as.factor(Western_alr_cat)4:Intro101      0.96  0.89   1.04
## Intro101:as.factor(Eastern_alr_cat)2      1.02  0.96   1.09
## Intro101:as.factor(Eastern_alr_cat)3      1.01  0.94   1.07
## Intro101:as.factor(Southeastern_alr_cat)2 0.98  0.93   1.03
## Intro101:as.factor(Southeastern_alr_cat)3 0.99  0.94   1.05
lrtest(model1, model2)
## Likelihood ratio test
## 
## Model 1: log10(Ct_value) ~ as.factor(Western_alr_cat) * Intro10 + as.factor(Eastern_alr_cat) * 
##     Intro10 + as.factor(Southeastern_alr_cat) * Intro10 + patient_sex + 
##     age_cat + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     TB_category + general_examination_malnutrition + Education + 
##     DR_category
## Model 2: log10(Ct_value) ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + 
##     age_cat + TB_RF_smoking + as.factor(cough_cat) + socio_cat + 
##     TB_category + general_examination_malnutrition + Education + 
##     DR_category
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1  32 442.91                     
## 2  25 441.19 -7 3.4368     0.8419

does the removal of the cough duration variable change the results?

–> no it does not

library(lmtest)

# cough duration
# for the ancestries as category
model2 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)+ as.factor(Eastern_alr_cat) + as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + age_cat + TB_RF_smoking + socio_cat + TB_category + general_examination_malnutrition + Education + DR_category, data = clean, na.action = na.exclude, family = "gaussian")
model1 <- glm(log10(Ct_value) ~ as.factor(Western_alr_cat)*Intro10+ as.factor(Eastern_alr_cat)*Intro10 + as.factor(Southeastern_alr_cat)*Intro10 + patient_sex + age_cat + TB_RF_smoking + socio_cat+ TB_category + general_examination_malnutrition + Education + DR_category, data = clean, na.action = na.exclude, family = "gaussian")

summary(model1)
## 
## Call:
## glm(formula = log10(Ct_value) ~ as.factor(Western_alr_cat) * 
##     Intro10 + as.factor(Eastern_alr_cat) * Intro10 + as.factor(Southeastern_alr_cat) * 
##     Intro10 + patient_sex + age_cat + TB_RF_smoking + socio_cat + 
##     TB_category + general_examination_malnutrition + Education + 
##     DR_category, family = "gaussian", data = clean, na.action = na.exclude)
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                                1.2297527  0.0292959  41.977
## as.factor(Western_alr_cat)2                0.0381524  0.0274632   1.389
## as.factor(Western_alr_cat)3                0.0344074  0.0248953   1.382
## as.factor(Western_alr_cat)4                0.0332125  0.0248690   1.335
## Intro101                                   0.0055354  0.0430309   0.129
## as.factor(Eastern_alr_cat)2               -0.0084508  0.0184258  -0.459
## as.factor(Eastern_alr_cat)3                0.0170052  0.0196426   0.866
## as.factor(Southeastern_alr_cat)2           0.0240648  0.0159530   1.508
## as.factor(Southeastern_alr_cat)3           0.0054801  0.0165073   0.332
## patient_sexfemale                         -0.0018551  0.0114244  -0.162
## age_cat2                                   0.0038203  0.0115821   0.330
## age_cat3                                   0.0001803  0.0143961   0.013
## age_cat4                                   0.0144029  0.0188256   0.765
## TB_RF_smokingyes                          -0.0152843  0.0121339  -1.260
## socio_cat2                                 0.0003932  0.0104457   0.038
## socio_cat3                                 0.0075365  0.0148911   0.506
## TB_categoryrelapse                        -0.0044658  0.0364981  -0.122
## general_examination_malnutritionyes        0.0113124  0.0184326   0.614
## Educationno                               -0.0266781  0.0147495  -1.809
## Educationsecondary                        -0.0047713  0.0127420  -0.374
## Educationuniversity                        0.0207055  0.0225085   0.920
## DR_categoryresistant                      -0.0094520  0.0267040  -0.354
## as.factor(Western_alr_cat)2:Intro101      -0.0152760  0.0436576  -0.350
## as.factor(Western_alr_cat)3:Intro101      -0.0063865  0.0395777  -0.161
## as.factor(Western_alr_cat)4:Intro101      -0.0331488  0.0396645  -0.836
## Intro101:as.factor(Eastern_alr_cat)2       0.0224089  0.0329516   0.680
## Intro101:as.factor(Eastern_alr_cat)3       0.0053902  0.0333137   0.162
## Intro101:as.factor(Southeastern_alr_cat)2 -0.0239459  0.0267920  -0.894
## Intro101:as.factor(Southeastern_alr_cat)3 -0.0113146  0.0277123  -0.408
##                                           Pr(>|t|)    
## (Intercept)                                 <2e-16 ***
## as.factor(Western_alr_cat)2                 0.1655    
## as.factor(Western_alr_cat)3                 0.1676    
## as.factor(Western_alr_cat)4                 0.1824    
## Intro101                                    0.8977    
## as.factor(Eastern_alr_cat)2                 0.6467    
## as.factor(Eastern_alr_cat)3                 0.3871    
## as.factor(Southeastern_alr_cat)2            0.1321    
## as.factor(Southeastern_alr_cat)3            0.7401    
## patient_sexfemale                           0.8711    
## age_cat2                                    0.7417    
## age_cat3                                    0.9900    
## age_cat4                                    0.4446    
## TB_RF_smokingyes                            0.2085    
## socio_cat2                                  0.9700    
## socio_cat3                                  0.6130    
## TB_categoryrelapse                          0.9027    
## general_examination_malnutritionyes         0.5397    
## Educationno                                 0.0712 .  
## Educationsecondary                          0.7082    
## Educationuniversity                         0.3581    
## DR_categoryresistant                        0.7235    
## as.factor(Western_alr_cat)2:Intro101        0.7266    
## as.factor(Western_alr_cat)3:Intro101        0.8719    
## as.factor(Western_alr_cat)4:Intro101        0.4038    
## Intro101:as.factor(Eastern_alr_cat)2        0.4968    
## Intro101:as.factor(Eastern_alr_cat)3        0.8715    
## Intro101:as.factor(Southeastern_alr_cat)2   0.3719    
## Intro101:as.factor(Southeastern_alr_cat)3   0.6833    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009877818)
## 
##     Null deviance: 4.6156  on 471  degrees of freedom
## Residual deviance: 4.3759  on 443  degrees of freedom
## AIC: -809.89
## 
## Number of Fisher Scoring iterations: 2
ORs <- exp(cbind(OR = coef(model1), confint(model1)))
## Waiting for profiling to be done...
round(ORs, digits = 2)
##                                             OR 2.5 % 97.5 %
## (Intercept)                               3.42  3.23   3.62
## as.factor(Western_alr_cat)2               1.04  0.98   1.10
## as.factor(Western_alr_cat)3               1.04  0.99   1.09
## as.factor(Western_alr_cat)4               1.03  0.98   1.09
## Intro101                                  1.01  0.92   1.09
## as.factor(Eastern_alr_cat)2               0.99  0.96   1.03
## as.factor(Eastern_alr_cat)3               1.02  0.98   1.06
## as.factor(Southeastern_alr_cat)2          1.02  0.99   1.06
## as.factor(Southeastern_alr_cat)3          1.01  0.97   1.04
## patient_sexfemale                         1.00  0.98   1.02
## age_cat2                                  1.00  0.98   1.03
## age_cat3                                  1.00  0.97   1.03
## age_cat4                                  1.01  0.98   1.05
## TB_RF_smokingyes                          0.98  0.96   1.01
## socio_cat2                                1.00  0.98   1.02
## socio_cat3                                1.01  0.98   1.04
## TB_categoryrelapse                        1.00  0.93   1.07
## general_examination_malnutritionyes       1.01  0.98   1.05
## Educationno                               0.97  0.95   1.00
## Educationsecondary                        1.00  0.97   1.02
## Educationuniversity                       1.02  0.98   1.07
## DR_categoryresistant                      0.99  0.94   1.04
## as.factor(Western_alr_cat)2:Intro101      0.98  0.90   1.07
## as.factor(Western_alr_cat)3:Intro101      0.99  0.92   1.07
## as.factor(Western_alr_cat)4:Intro101      0.97  0.90   1.05
## Intro101:as.factor(Eastern_alr_cat)2      1.02  0.96   1.09
## Intro101:as.factor(Eastern_alr_cat)3      1.01  0.94   1.07
## Intro101:as.factor(Southeastern_alr_cat)2 0.98  0.93   1.03
## Intro101:as.factor(Southeastern_alr_cat)3 0.99  0.94   1.04
lrtest(model1, model2)
## Likelihood ratio test
## 
## Model 1: log10(Ct_value) ~ as.factor(Western_alr_cat) * Intro10 + as.factor(Eastern_alr_cat) * 
##     Intro10 + as.factor(Southeastern_alr_cat) * Intro10 + patient_sex + 
##     age_cat + TB_RF_smoking + socio_cat + TB_category + general_examination_malnutrition + 
##     Education + DR_category
## Model 2: log10(Ct_value) ~ as.factor(Western_alr_cat) + as.factor(Eastern_alr_cat) + 
##     as.factor(Southeastern_alr_cat) + Intro10 + patient_sex + 
##     age_cat + TB_RF_smoking + socio_cat + TB_category + general_examination_malnutrition + 
##     Education + DR_category
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1  30 434.95                     
## 2  23 433.19 -7 3.5211      0.833